diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..23c6316b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.8) + +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") + +project(tarantella VERSION 0.6.0) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror") + +option(LINK_IB "Defines whether to link against Infiniband drivers [default: disabled]" off) +option(ENABLE_TESTING "Compile tests [default: disabled]" off) +option(BUILD_DOCS "Build documentation [default: disabled]" off) + +set(SRC_DIR "${CMAKE_SOURCE_DIR}/src") +set(CMAKE_BUILD_DIR "${CMAKE_SOURCE_DIR}/build") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + +set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib/tarantella") +set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin") + +find_package(GPI2 REQUIRED) +find_package(pybind11 REQUIRED) +find_package(Tensorflow REQUIRED) + +add_subdirectory(${SRC_DIR}) +add_subdirectory(${SRC_DIR}/gpi_comm_lib/gpi) +add_subdirectory(${SRC_DIR}/gpi_comm_lib/collectives) +add_subdirectory(${SRC_DIR}/gpi_comm_lib) +add_subdirectory(${SRC_DIR}/gpi_comm_lib/tf_ops) + +if (BUILD_DOCS) + find_package(Sphinx) + add_subdirectory(docs) +endif() + +if (ENABLE_TESTING) + find_package(Boost 1.61 REQUIRED COMPONENTS + unit_test_framework) + find_package(PythonModules REQUIRED COMPONENTS + numpy + pytest) + enable_testing() + set(SLEEP_TIME_AFTER_TEST 4) + add_subdirectory(${CMAKE_SOURCE_DIR}/test) +endif() + diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..67da183f --- /dev/null +++ b/LICENSE @@ -0,0 +1,101 @@ +TARANTELLA END USER LICENSE AGREEMENT +October 21, 2020 + +PLEASE READ THIS LICENSE AGREEMENT CAREFULLY. BY USING THE SOFTWARE TARANTELLA YOU +ACCEPT ALL TERMS OF THE LICENSE AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF +THIS LICENSE, DO NOT INSTALL, COPY, OR USE THE SOFTWARE. + +1.) DEFINITIONS + +1.1) LICENSOR: Fraunhofer Gesellschaft zur Foerderung der angewandten Forschung +e.V., Hansastr. 27c, 80686 Muenchen, Germany, as legal entity of Fraunhofer- +Institut fuer Techno- und Wirtschaftsmathematik, Fraunhofer-Platz 1, +67663 Kaiserslautern, Germany. + +1.2) LICENSEE: The user of Tarantella under this License Agreement. + +1.3) LICENSED SOFTWARE: The Software Tarantella in source code and object code form +including all executable programs. + +1.4) DOCUMENTATION: The Tarantella documentation, user's guide, e-mails and other explanatory +materials accompanying the LICENSED SOFTWARE in printed or electronic form. + +2.) OWNERSHIP / INTELLECTUAL PROPERTY RIGHTS + +LICENSEE acknowledges that ownership and all intellectual property rights +related to the LICENSED SOFTWARE and to the DOCUMENTATION, including patents, +copyright, company or trade secrets remain with the LICENSOR. + +LICENSEE promises to keep and not to modify the copyright notices of the +LICENSOR. + +3.) SCOPE OF LICENSE + +3.1) Provided LICENSEE accepts all terms of this License Agreement, LICENSEE +is granted a non-exclusive, non-assignable right to use the LICENSED SOFTWARE, +which means LICENSEE may use the software for an unrestricted number of users, +as well as use the accompanying DOCUMENTATION by the actual number of users. + +3.2) Without prior written consent of LICENSOR or an authorized partner, +LICENSEE may modify the source code and use the modified version of the LICENSED +SOFTWARE for internal use only. 
+ +3.2.1) LICENSEE must inform users of modified versions about the fact that the +software differs from the original version. + +3.2.2) The LICENSED SOFTWARE and the modifications generated by LICENSEE shall +remain the property of LICENSOR and no rights, including but not limited to the +right to apply for industrial property rights, are granted to LICENSEE. + +3.3) Without prior written consent of LICENSOR or an authorized partner, +LICENSEE may not: +- use, copy or distribute the LICENSED SOFTWARE except as provided for under + sections 3.1 and 3.2. +- provide commercial turn-key solutions based on the LICENSED SOFTWARE or + commercial services for the LICENSED SOFTWARE to any third party. +- rent or lease the LICENSED SOFTWARE and DOCUMENTATION to any third party. +- modify, adapt, or translate the LICENSED SOFTWARE for any third party. + +3.4) The license under this License Agreement relates to the LICENSED SOFTWARE. + +4.) LIMITED WARRANTY AND LIABILITY + +4.1) LICENSOR confirms that the LICENSED SOFTWARE has been developed without +infringement of any rights of third parties, in particular patents, copyrights +or other intellectual property rights of third parties. Nevertheless LICENSOR +does not warrant that the use of the LICENSED SOFTWARE by LICENSEE does not +infringe any third party intellectual property rights. + +4.2) LICENSEE is aware that there is a risk that the LICENSED SOFTWARE might +damage the data or the computer of the LICENSEE or even other computers on the +network in unpredictable ways. The use of the LICENSED SOFTWARE is at the +exclusive risk of the LICENSEE. LICENSOR does not offer any warranty either +expressed or implied and is not liable for any damages resulting from the use of +the LICENSED SOFTWARE or DOCUMENTATION such as, but not limited to, data loss. + +4.3) Notwithstanding sections 4.1 and 4.2, the liability of the LICENSOR, its +legal representatives and employees resulting from breach of duty or tort is +restricted to damages caused intentionally or by gross negligence. In any case, +the liability under this section is limited by typical, foreseeable, direct +damages. The liability is unrestricted for damages of the body, life or health. + +5.) MISCELLANEOUS + +This License Agreement in English is the original one. The terms of this +Agreement can only be modified or amended in writing. In case of interpretation +controversies the terms of this Agreement shall prevail over the respective +terms of any other agreements. + +This Agreement is construed under the Law of the Federal Republic of Germany. +Therefore, any and all controversies resulting out of this Agreement shall be +resolved under the Law of the Federal Republic of Germany excluding the German +International Private Law Rules. The application of the UN-Convention of the +International Sales of Goods (CISG) is explicitly excluded. Exclusive venue of +jurisdiction for both parties shall be Munich, Germany. + +In case that one or several of the terms of this Agreement should be or become +invalid or unenforceable, the validity of the other terms shall remain +unaffected. In such a case, the parties shall replace the invalid or +unenforceable condition by another legally effective provision meeting the +purpose of the abolished provision to the greatest extent. The same applies in +case of a gap of regulation. diff --git a/README.md b/README.md new file mode 100644 index 00000000..86c77563 --- /dev/null +++ b/README.md @@ -0,0 +1,41 @@ +![Tarantella](docs/source/pics/tnt_logo_text.png) + +

+ +Tarantella is an open-source, distributed Deep Learning framework built on top of TensorFlow 2, +providing scalable Deep Neural Network training on CPU and GPU compute clusters. + +Tarantella is easy-to-use, allows to re-use existing TensorFlow 2/Keras models, +and does not require any knowledge of parallel computing. + + +## Goals + +Tarantella is designed to meet the following goals: + +* strong scalability +* ease of use +* synchronous training scheme +* seamless integration with existing Keras models +* support for GPU and CPU systems + +## Install + +To build Tarantella from source, the following dependencies are required: + +* [TensorFlow 2](https://www.tensorflow.org/install) (supported versions TF2.2, TF2.1, TF2.0) +* [GPI-2](https://github.com/cc-hpc-itwm/GPI-2) (version 1.4.0) +* [pybind11](https://github.com/pybind/pybind11) (from version 2.4.3) +* C++ compiler (e.g., `gcc` from version 7.4.0) +* CMake (from version 3.8) + +Detailed installation instructions can be found in the [technical docs](https://tarantella.readthedocs.io/en/latest/installation.html). + +## Resources + +* [Official website](https://www.tarantella.org) +* [Technical documentation](https://tarantella.readthedocs.io/en/latest) + +## License + +[License](LICENSE) diff --git a/cmake/FindDNNL.cmake b/cmake/FindDNNL.cmake new file mode 100644 index 00000000..1f015465 --- /dev/null +++ b/cmake/FindDNNL.cmake @@ -0,0 +1,37 @@ +# Finds Intel DNNL library +# Martin Kuehn May 2020 + +find_path(DNNL_INCLUDE_DIR + NAMES dnnl.hpp + PATHS ${DNNL_ROOT} + ENV DNNL_ROOT + ${DNNL_ROOT_DIR} + ENV DNNL_ROOT_DIR + PATH_SUFFIXES include + DOC "DNNL header files" +) + +find_library(DNNL_LIBRARY dnnl + PATHS ${DNNL_ROOT} + ENV DNNL_ROOT + ${DNNL_ROOT_DIR} + ENV DNNL_ROOT_DIR + PATH_SUFFIXES lib lib64 + DOC "DNNL library files") + +#include (FindPackageHandleStandardArgs) +find_package_handle_standard_args(DNNL + DEFAULT_MSG + DNNL_LIBRARY + DNNL_INCLUDE_DIR) + +mark_as_advanced(DNNL_INCLUDE_DIR DNNL_LIBRARY) + +set(DNNL_INCLUDE_DIRS ${DNNL_INCLUDE_DIR}) +set(DNNL_LIBRARIES ${DNNL_LIBRARY}) + +if(DNNL_FOUND AND NOT TARGET dnnl) + add_library(dnnl SHARED IMPORTED GLOBAL) + target_include_directories(dnnl INTERFACE ${DNNL_INCLUDE_DIRS}) + set_property(TARGET dnnl PROPERTY IMPORTED_LOCATION ${DNNL_LIBRARIES}) +endif() diff --git a/cmake/FindGPI2.cmake b/cmake/FindGPI2.cmake new file mode 100644 index 00000000..d4bd3360 --- /dev/null +++ b/cmake/FindGPI2.cmake @@ -0,0 +1,133 @@ + +#[=======================================================================[.rst: +FindGPI2 +------- + +Finds the GPI2 library. + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported targets, if found: + +``GPI2::GPI2`` + The GPI2 library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``GPI2_FOUND`` + True if the system has the GPI2 library. +``GPI2_INCLUDE_DIRS`` + Include directories needed to use GPI2. +``GPI2_LIBRARIES`` + Libraries needed to link to GPI2. +``GPI2_DBG_LIBRARIES`` + Libraries needed to link to the Debug version of GPI2. +``GPI2_GASPI_RUN`` + Path to ``gaspi_run``. + +Cache Variables +^^^^^^^^^^^^^^^ + +The following cache variables may also be set: + +``GPI2_INCLUDE_DIR`` + The directory containing ``gaspi.h``. +``GPI2_LIBRARY`` + The path to the GPI2 library. 
+ +#]=======================================================================] + +set(GPI2_LIBRARY_NAME "GPI2") +set(GPI2_DBG_LIBRARY_NAME "GPI2-dbg") + +FIND_PROGRAM(GASPIRUN_PATH gaspi_run + PATHS + $ENV{PATH} + $ENV{LIB_DIR}/bin + /usr/local/bin/ + /usr/bin/ + ) + +IF (GASPIRUN_PATH) + get_filename_component(GASPIRUN_FOUND_HOME ${GASPIRUN_PATH} DIRECTORY) + get_filename_component(GPI2_INSTALLED_PATH ${GASPIRUN_FOUND_HOME} DIRECTORY) + get_filename_component(GPI2_INSTALLED_PATH ${GPI2_INSTALLED_PATH} REALPATH) +ENDIF(GASPIRUN_PATH) + +find_path (GPI2_INCLUDE_DIR GASPI.h + PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH} + PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH + PATH_SUFFIXES include) + +find_library (GPI2_DBG_LIBRARY ${GPI2_DBG_LIBRARY_NAME} + PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH} + PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH + PATH_SUFFIXES lib lib64) + +find_library (GPI2_LIBRARY ${GPI2_LIBRARY_NAME} + PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH} + PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH + PATH_SUFFIXES lib lib64) + +if (GPI2_DBG_LIBRARY) + message(STATUS "GPI2-dbg library path: ${GPI2_DBG_LIBRARY}" ) +else(GPI2_DBG_LIBRARY) + message(STATUS "GPI2-dbg library path: not found" ) +endif() + + +if (GPI2_LIBRARY) + message(STATUS "GPI2 library path: ${GPI2_LIBRARY}" ) +else(GPI2_LIBRARY) + message(STATUS "GPI2 library path: not found" ) +endif() + + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set GPI2_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(GPI2 DEFAULT_MSG + GASPIRUN_PATH + GPI2_DBG_LIBRARY GPI2_LIBRARY) + +mark_as_advanced(GPI2_INCLUDE_DIR GASPIRUN_PATH + GPI2_DBG_LIBRARY GPI2_LIBRARY) +set(GPI2_INCLUDE_DIRS ${GPI2_INCLUDE_DIR} ) +set(GPI2_DBG_LIBRARIES ${GPI2_DBG_LIBRARY} ) +set(GPI2_LIBRARIES ${GPI2_LIBRARY} ) +set(GPI2_GASPI_RUN ${GASPIRUN_PATH}) + +message(STATUS "Found GPI2: " ${GPI2_FOUND}) + +if(GPI2_FOUND AND NOT TARGET GPI2::GPI2) + set(THREADS_PREFER_PTHREAD_FLAG ON) + find_package(Threads REQUIRED) + add_library(GPI2::GPI2 SHARED IMPORTED GLOBAL) + target_link_libraries(GPI2::GPI2 INTERFACE Threads::Threads) + target_include_directories(GPI2::GPI2 INTERFACE ${GPI2_INCLUDE_DIRS}) + set_property(TARGET GPI2::GPI2 PROPERTY IMPORTED_LOCATION ${GPI2_LIBRARIES}) + + add_library(GPI2::GPI2dbg SHARED IMPORTED GLOBAL) + target_link_libraries(GPI2::GPI2dbg INTERFACE Threads::Threads) + target_include_directories(GPI2::GPI2dbg INTERFACE ${GPI2_INCLUDE_DIRS}) + set_property(TARGET GPI2::GPI2dbg PROPERTY IMPORTED_LOCATION ${GPI2_DBG_LIBRARIES}) + + if (LINK_IB) + find_package(IBverbs) + + if (IBverbs_FOUND) + message (STATUS "GPI2: linking against ibverbs") + target_link_libraries(GPI2::GPI2 INTERFACE IBverbs::IBverbs) + target_link_libraries(GPI2::GPI2dbg INTERFACE IBverbs::IBverbs) + else() + message (FATAL_ERROR "GPI2: could not find ibverbs, disable Infiniband \ + support (-DLINK_IB=OFF) to load GPI-2") + endif() + else() + message (STATUS "GPI2: loading library without Infiniband support") + endif() +endif() diff --git a/cmake/FindIBverbs.cmake b/cmake/FindIBverbs.cmake new file mode 100644 index 00000000..aeb205e6 --- /dev/null +++ b/cmake/FindIBverbs.cmake @@ -0,0 +1,61 @@ + +#[=======================================================================[.rst: +FindIBverbs +------- + +Finds the IBverbs library. 
+ +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported targets, if found: + +``IBverbs::IBverbs`` + The IBverbs library + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``IBverbs_FOUND`` + True if the system has the IBverbs library. +``IBverbs_INCLUDE_DIRS`` + Include directories needed to use IBverbs. +``IBverbs_LIBRARIES`` + Libraries needed to link to IBverbs. + +Cache Variables +^^^^^^^^^^^^^^^ + +The following cache variables may also be set: + +``IBverbs_INCLUDE_DIR`` + The directory containing the public headers. +``IBverbs_LIBRARY`` + The path to the IBverbs library. + +#]=======================================================================] + +find_path(IBverbs_INCLUDE_DIR + NAMES infiniband/verbs.h + ) + +find_library(IBverbs_LIBRARY + NAMES ibverbs) + +include(FindPackageHandleStandardArgs) +# handle the QUIETLY and REQUIRED arguments and set IBverbs_FOUND to TRUE +# if all listed variables are TRUE +find_package_handle_standard_args(IBverbs DEFAULT_MSG + IBverbs_INCLUDE_DIR IBverbs_LIBRARY) + +mark_as_advanced(IBverbs_INCLUDE_DIR IBverbs_LIBRARY) +set(IBverbs_LIBRARIES ${IBverbs_LIBRARY}) +set(IBverbs_INCLUDE_DIRS ${IBverbs_INCLUDE_DIR}) + +if(IBverbs_FOUND AND NOT TARGET IBverbs::IBverbs) + add_library(IBverbs::IBverbs SHARED IMPORTED GLOBAL) + target_include_directories(IBverbs::IBverbs INTERFACE ${IBverbs_INCLUDE_DIRS}) + set_property(TARGET IBverbs::IBverbs PROPERTY IMPORTED_LOCATION ${IBverbs_LIBRARIES}) +endif()
diff --git a/cmake/FindPythonModules.cmake b/cmake/FindPythonModules.cmake new file mode 100644 index 00000000..3cb0ed11 --- /dev/null +++ b/cmake/FindPythonModules.cmake @@ -0,0 +1,60 @@ +#[=======================================================================[.rst: +FindPythonModules +------- + +Finds installed Python modules + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``PythonModules_FOUND`` + True if all the required Python modules could be loaded. +``PythonModules_modulename_FOUND`` + True if `modulename` could be loaded. +``Python_EXECUTABLE`` + Path to the Python executable. 
+ +#]=======================================================================] + +execute_process(COMMAND sh -c "which python" + OUTPUT_VARIABLE python_path + RESULT_VARIABLE result + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +if (result EQUAL "0" AND EXISTS ${python_path}) + set(Python_EXECUTABLE "${python_path}") +endif() + +set(PythonModules_FOUND TRUE) +if (Python_EXECUTABLE) + foreach (module IN LISTS PythonModules_FIND_COMPONENTS) + execute_process(COMMAND ${Python_EXECUTABLE} -c + "import ${module}" + RESULT_VARIABLE result + ERROR_QUIET OUTPUT_QUIET) + + if(result) + set (PythonModules_${module}_FOUND FALSE) + set (PythonModules_FOUND FALSE) + else() + set (PythonModules_${module}_FOUND TRUE) + endif() + endforeach() +endif() + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args (PythonModules + REQUIRED_VARS Python_EXECUTABLE PythonModules_FOUND + HANDLE_COMPONENTS) diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake new file mode 100644 index 00000000..406dc8bb --- /dev/null +++ b/cmake/FindSphinx.cmake @@ -0,0 +1,16 @@ +include(FindPackageHandleStandardArgs) + +find_program(Sphinx_EXECUTABLE + NAMES sphinx-build sphinx-build2 + DOC "Path to sphinx-build executable") + +find_package_handle_standard_args(Sphinx REQUIRED_VARS Sphinx_EXECUTABLE) + +if (Sphinx_FOUND) + mark_as_advanced(Sphinx_EXECUTABLE) +endif() + +if (Sphinx_FOUND AND NOT TARGET Sphinx::Sphinx) + add_executable(Sphinx::Sphinx IMPORTED) + set_property(TARGET Sphinx::Sphinx PROPERTY IMPORTED_LOCATION ${Sphinx_EXECUTABLE}) +endif() diff --git a/cmake/FindTensorflow.cmake b/cmake/FindTensorflow.cmake new file mode 100644 index 00000000..4afa4616 --- /dev/null +++ b/cmake/FindTensorflow.cmake @@ -0,0 +1,106 @@ + +#[=======================================================================[.rst: +FindTensorflow +------- + +Finds the Tensorflow package as described in: +https://www.tensorflow.org/guide/create_op#compile_the_op_using_your_system_compiler_tensorflow_binary_installation + + +Imported Targets +^^^^^^^^^^^^^^^^ + +This module provides the following imported targets, if found: + +``Tensorflow::Tensorflow`` + The Tensorflow library. + The target will set the CXX11_ABI_FLAG according to the ABI used to compile the TensorFlow library. + +Result Variables +^^^^^^^^^^^^^^^^ + +This will define the following variables: + +``Tensorflow_FOUND`` + True if the system has the Tensorflow library. +``Tensorflow_INCLUDE_DIRS`` + Include directories needed to use Tensorflow. +``Tensorflow_LIBRARIES`` + Libraries needed to link to Tensorflow. + +Cache Variables +^^^^^^^^^^^^^^^ + +The following cache variables may also be set: + +``Tensorflow_INCLUDE_DIR`` + The directory containing the Tensorflow library headers. +``Tensorflow_LIBRARY`` + The path to the Tensorflow library. 
+ +#]=======================================================================] + +execute_process(COMMAND sh -c "which python" + OUTPUT_VARIABLE python_path + RESULT_VARIABLE result + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) +if (result EQUAL "0" AND EXISTS ${python_path}) + set(Python_EXECUTABLE "${python_path}") +endif() + +if (Python_EXECUTABLE) + execute_process(COMMAND ${Python_EXECUTABLE} -c + "import tensorflow as tf; print(tf.sysconfig.get_lib())" + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE result_tf_lib + OUTPUT_VARIABLE Tensorflow_LIBRARY_DIR + ERROR_QUIET) + + execute_process(COMMAND ${Python_EXECUTABLE} -c + "import tensorflow as tf; print(tf.sysconfig.get_include())" + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE result_tf_incl + OUTPUT_VARIABLE Tensorflow_INCLUDE_DIR + ERROR_QUIET) + + execute_process(COMMAND ${Python_EXECUTABLE} -c + "import tensorflow as tf; print(tf.sysconfig.CXX11_ABI_FLAG)" + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE result_tf_abi_flag + OUTPUT_VARIABLE Tensorflow_CXX11_ABI_FLAG + ERROR_QUIET) +endif() + +set(Tensorflow_LIBRARY_NAME libtensorflow_framework.so.2) +find_library (Tensorflow_LIBRARY ${Tensorflow_LIBRARY_NAME} + PATHS ${Tensorflow_LIBRARY_DIR} + PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Tensorflow DEFAULT_MSG + Tensorflow_LIBRARY + Tensorflow_INCLUDE_DIR) + +mark_as_advanced(Tensorflow_INCLUDE_DIR Tensorflow_LIBRARY) +set(Tensorflow_INCLUDE_DIRS ${Tensorflow_INCLUDE_DIR} ) +set(Tensorflow_LIBRARIES ${Tensorflow_LIBRARY} ) + +message(STATUS "Found Tensorflow: " ${Tensorflow_FOUND}) + +if(Tensorflow_FOUND AND NOT TARGET tensorflow_framework) + add_library(Tensorflow::Tensorflow SHARED IMPORTED GLOBAL) + target_include_directories(Tensorflow::Tensorflow INTERFACE ${Tensorflow_INCLUDE_DIRS}) + set_property(TARGET Tensorflow::Tensorflow PROPERTY IMPORTED_LOCATION ${Tensorflow_LIBRARIES}) + + # Enable libraries that link against the TensorFlow library to use + # the correct value of the CXX11_ABI_FLAG. + # E.g., the official pip TensorFlow packages require CXX11_ABI_FLAG=0, + # whereas the conda packages set CXX11_ABI_FLAG=1. 
+ if ("${result_tf_abi_flag}" EQUAL "0") + target_compile_definitions(Tensorflow::Tensorflow INTERFACE _GLIBCXX_USE_CXX11_ABI=${Tensorflow_CXX11_ABI_FLAG}) + endif() +endif() + + + diff --git a/cmake/add_macros.cmake b/cmake/add_macros.cmake new file mode 100644 index 00000000..12411693 --- /dev/null +++ b/cmake/add_macros.cmake @@ -0,0 +1,101 @@ + +macro (_default_if_unset VAR VAL) + if (NOT ${VAR}) + set (${VAR} ${VAL}) + endif() +endmacro() + +include (parse_arguments) + +function (extended_add_library) + set (options POSITION_INDEPENDENT PRECOMPILED INSTALL) + set (one_value_options NAME NAMESPACE TYPE INSTALL_DESTINATION) + set (multi_value_options + LIBRARIES SOURCES PUBLIC_HEADERS INCLUDE_DIRECTORIES RPATH + SYSTEM_INCLUDE_DIRECTORIES COMPILE_DEFINITIONS COMPILE_OPTIONS DEPENDS + ) + set (required_options NAME) + _parse_arguments (ARG "${options}" "${one_value_options}" "${multi_value_options}" "${required_options}" ${ARGN}) + + _default_if_unset (ARG_TYPE "STATIC") + _default_if_unset (ARG_INSTALL_DESTINATION "lib") + + if (ARG_NAMESPACE) + set (target_name "${ARG_NAMESPACE}-${ARG_NAME}") + else() + set (target_name "${ARG_NAME}") + endif() + + if (NOT (${ARG_TYPE} STREQUAL "STATIC" OR ${ARG_TYPE} STREQUAL "SHARED" OR ${ARG_TYPE} STREQUAL "MODULE")) + message (FATAL_ERROR "Bad library type: ${ARG_TYPE}") + endif() + + set (_scope_specifier) + if ((NOT ARG_SOURCES AND NOT ARG_MOC) OR ARG_PRECOMPILED) + set (_scope_specifier INTERFACE) + + add_library (${target_name} INTERFACE) + + if (ARG_PRECOMPILED) + if (ARG_TYPE STREQUAL "STATIC") + list (APPEND ARG_LIBRARIES "${CMAKE_CURRENT_SOURCE_DIR}/lib${target_name}.a") + else() + list (APPEND ARG_LIBRARIES "${CMAKE_CURRENT_SOURCE_DIR}/lib${target_name}.so") + endif() + endif() + + target_link_libraries (${target_name} INTERFACE ${ARG_LIBRARIES}) + else() + set (_scope_specifier PUBLIC) + + # _moc (${ARG_NAME}_mocced ${ARG_MOC}) + + add_library (${target_name} ${ARG_TYPE} #${${ARG_NAME}_mocced} + ${ARG_SOURCES}) + + target_link_libraries (${target_name} ${ARG_LIBRARIES}) + endif() + if (ARG_NAMESPACE) + add_library (${ARG_NAMESPACE}::${ARG_NAME} ALIAS ${target_name}) + endif() + if (ARG_PUBLIC_HEADERS) + set_property (TARGET ${target_name} APPEND + PROPERTY PUBLIC_HEADER ${ARG_PUBLIC_HEADERS} + ) + endif() + + if (ARG_SYSTEM_INCLUDE_DIRECTORIES) + target_include_directories (${target_name} SYSTEM + ${ARG_SYSTEM_INCLUDE_DIRECTORIES}) + endif() + if (ARG_INCLUDE_DIRECTORIES) + target_include_directories (${target_name} PUBLIC + $) + endif() + + if (ARG_POSITION_INDEPENDENT) + set_property (TARGET ${target_name} APPEND + PROPERTY COMPILE_FLAGS -fPIC + ) + endif() + + if (ARG_DEPENDS) + add_dependencies (${target_name} ${ARG_DEPENDS}) + endif() + + if (ARG_COMPILE_DEFINITIONS) + target_compile_definitions (${target_name} ${_scope_specifier} ${ARG_COMPILE_DEFINITIONS}) + endif() + + if (ARG_COMPILE_OPTIONS) + target_compile_options (${target_name} ${_scope_specifier} ${ARG_COMPILE_OPTIONS}) + endif() + + if (ARG_INSTALL) + install (TARGETS ${target_name} + LIBRARY DESTINATION "${ARG_INSTALL_DESTINATION}" + ARCHIVE DESTINATION "${ARG_INSTALL_DESTINATION}" + ) + endif() +endfunction() + diff --git a/cmake/add_test.cmake b/cmake/add_test.cmake new file mode 100644 index 00000000..a803e2cb --- /dev/null +++ b/cmake/add_test.cmake @@ -0,0 +1,174 @@ +include (parse_arguments) + +function (compile_tarantella_test) + set(one_value_options NAME DESCRIPTION) + set(multi_value_options SOURCES LIBRARIES INCLUDE_DIRECTORIES + SYSTEM_INCLUDE_DIRECTORIES ARGS 
COMPILE_FLAGS) + set(required_options NAME SOURCES) + + # save each argument into a variable named "ARG_argname" + _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + + _default_if_unset(ARG_DESCRIPTION "${ARG_NAME}") + set(target_name ${ARG_NAME}) + + add_executable (${target_name} ${ARG_SOURCES}) + list (APPEND ARG_LIBRARIES Boost::unit_test_framework + Boost::dynamic_linking) + target_compile_definitions (${target_name} PRIVATE + "-DBOOST_TEST_MODULE=\"${ARG_DESCRIPTION}\"" + "-DBOOST_TEST_DYN_LINK") + + #! \note Use RPATH for all tests + set_property (TARGET ${target_name} PROPERTY BUILD_WITH_INSTALL_RPATH true) + set_property (TARGET ${target_name} APPEND PROPERTY + INSTALL_RPATH + ${Boost_INCLUDE_DIR}/../lib:${CMAKE_BINARY_DIR}) + + if (Boost_VERSION VERSION_EQUAL 1.60 OR Boost_VERSION VERSION_GREATER 1.60) + list (INSERT ARG_ARGS 0 "--") + endif() + + if (ARG_SYSTEM_INCLUDE_DIRECTORIES) + target_include_directories (${target_name} SYSTEM + ${ARG_SYSTEM_INCLUDE_DIRECTORIES}) + endif() + if (ARG_INCLUDE_DIRECTORIES) + target_include_directories (${target_name} PRIVATE ${ARG_INCLUDE_DIRECTORIES}) + endif() + + target_link_libraries (${target_name} ${ARG_LIBRARIES}) + if (ARG_COMPILE_FLAGS) + set_property (TARGET ${target_name} PROPERTY COMPILE_FLAGS ${ARG_COMPILE_FLAGS}) + endif() +endfunction() + +function (tarantella_gen_environment_paths) + set(multi_value_options VARIABLE_LIST) + set(required_options VARIABLE_LIST) + _parse_arguments(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + set(env_var_names PATH LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH CPATH PYTHONPATH) + set(env_vars ) + + foreach (var_name ${env_var_names}) + if (DEFINED ENV{${var_name}}) + list(APPEND env_vars "${var_name}=$ENV{${var_name}}") + endif() + endforeach() + set(${ARG_VARIABLE_LIST} ${env_vars} PARENT_SCOPE) +endfunction() + +function (tarantella_gen_executable_script) + set(one_value_options SCRIPT_DIR SCRIPT_NAME) + set(required_options SCRIPT_DIR SCRIPT_NAME) + _parse_arguments(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + + set(tmp_script_path ${CMAKE_CURRENT_BINARY_DIR}/tmp/${ARG_SCRIPT_NAME}) + file(REMOVE ${ARG_SCRIPT_DIR}/${ARG_SCRIPT_NAME}) + file(WRITE ${tmp_script_path} "") + file(COPY ${tmp_script_path} + DESTINATION ${ARG_SCRIPT_DIR} + FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + ) + file(REMOVE ${tmp_script_path}) +endfunction() + +function (tarantella_gen_gpi_machinefile) + set(one_value_options NRANKS FILENAME) + set(required_options NRANKS FILENAME) + _parse_arguments(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + + file(WRITE ${ARG_FILENAME} "") + cmake_host_system_information(RESULT hostname QUERY HOSTNAME) + foreach(index RANGE 1 ${ARG_NRANKS}) + file(APPEND ${ARG_FILENAME} "${hostname}\n") + endforeach() +endfunction() + +function (tarantella_gen_test_script) + set(one_value_options NAME SCRIPT_DIR TEST_FILE) + set(options IS_PYTHON_TEST) + set(required_options NAME SCRIPT_DIR TEST_FILE) + _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + + message(STATUS "Test: Generating ${ARG_NAME} script") + tarantella_gen_executable_script(SCRIPT_NAME ${ARG_NAME} + SCRIPT_DIR ${ARG_SCRIPT_DIR}) + + tarantella_gen_environment_paths(VARIABLE_LIST env_paths) + + 
set(script_path ${ARG_SCRIPT_DIR}/${ARG_NAME}) + foreach (var ${env_paths}) + file(APPEND ${script_path} "export ${var}\n") + endforeach() + if (ARG_IS_PYTHON_TEST) + # Python test + file(APPEND ${script_path} "export PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_SOURCE_DIR}/src:\$\{PYTHONPATH\}\n") + file(APPEND ${script_path} "\n${Python_EXECUTABLE} -m pytest ${ARG_TEST_FILE}\n") + else() + # regular executable test + file(APPEND ${script_path} "\n${ARG_TEST_FILE}\n") + endif() +endfunction() + +function (tarantella_add_gpi_test) + set(one_value_options NAME TARGET_FILE NRANKS RUNCOMMAND TEST_FILE + MACHINEFILE CLEANUP TIMEOUT SLEEP) + set(multi_value_options LABELS) + set(required_options NAME TARGET_FILE NRANKS RUNCOMMAND) + _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + _default_if_unset(ARG_SLEEP 0) + set(test_name ${ARG_NAME}_${ARG_NRANKS}ranks) + + # increase overall timeout time to include the sleep time after the actual test + if (ARG_TIMEOUT) + math(EXPR ARG_TIMEOUT "${ARG_SLEEP} + ${ARG_TIMEOUT}") + endif() + + if (ARG_MACHINEFILE) + # use user-defined machinefile + set(runparams "-n ${ARG_NRANKS} -m ${ARG_MACHINEFILE}") + else() + # generate machinefile for ARG_NRANKS running on the localhost + set(machinefile_path ${CMAKE_CURRENT_BINARY_DIR}/machinefile_${ARG_NAME}_${ARG_NRANKS}.tmp) + tarantella_gen_gpi_machinefile(NRANKS ${ARG_NRANKS} + FILENAME ${machinefile_path}) + set(runparams "-n ${ARG_NRANKS} -m ${machinefile_path}") + endif() + + # create gaspi_run test + add_test(NAME ${test_name} + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND "${CMAKE_COMMAND}" + -DRUNCOMMAND=${ARG_RUNCOMMAND} + -DRUNCOMMAND_ARGS="${runparams}" + -DTEST_EXECUTABLE="${ARG_TARGET_FILE}" + -DTEST_DIR="${CMAKE_BINARY_DIR}" + -DSLEEP="${ARG_SLEEP}" + -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake" + ) + + # set labels if specified + if (ARG_LABELS) + set_property(TEST ${test_name} PROPERTY LABELS ${ARG_LABELS}) + endif() + + # set cleanup fixture script if specified + if (ARG_CLEANUP) + set_tests_properties(${test_name} PROPERTIES FIXTURES_REQUIRED ${ARG_CLEANUP}) + endif() + + # set timeout if specified + if (ARG_TIMEOUT) + set_tests_properties(${test_name} PROPERTIES TIMEOUT ${ARG_TIMEOUT}) + endif() + + # make sure the GPI tests are not run in parallel + set_tests_properties(${test_name} PROPERTIES RESOURCE_LOCK GPI_run_serial) +endfunction() diff --git a/cmake/add_test_wrappers.cmake b/cmake/add_test_wrappers.cmake new file mode 100644 index 00000000..119c1ef8 --- /dev/null +++ b/cmake/add_test_wrappers.cmake @@ -0,0 +1,151 @@ +include (add_test) + +function (tarantella_compile_and_generate_gpi_test) + set (one_value_options NAME DESCRIPTION TIMEOUT) + set (multi_value_options LOCALRANKS_LIST SOURCES LIBRARIES INCLUDE_DIRECTORIES + SYSTEM_INCLUDE_DIRECTORIES ARGS COMPILE_FLAGS) + set (required_options NAME SOURCES LOCALRANKS_LIST) + _parse_arguments (ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + _default_if_unset (ARG_TIMEOUT 10) + set(CLEANUP_TEST_NAME gpi_cleanup) + + set (target_name ${ARG_NAME}.test) + compile_tarantella_test(${ARGN} + NAME ${target_name}) + + # wrap call to the test executable in a script that exports the current environment + # the script can then be executed within a `gaspi_run` call + set(script_name run_${ARG_NAME}.sh) + set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name}) + tarantella_gen_test_script(NAME ${script_name} + 
SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR} + TEST_FILE ${CMAKE_CURRENT_BINARY_DIR}/${target_name}) + + message(STATUS "Test: Generating gaspi_run tests for ${ARG_NAME} with ${ARG_LOCALRANKS_LIST} ranks") + foreach(nlocalranks ${ARG_LOCALRANKS_LIST}) + tarantella_add_gpi_test (NAME ${ARG_NAME} + NRANKS ${nlocalranks} + TARGET_FILE ${script_path} + TEST_FILE "${CMAKE_CURRENT_BINARY_DIR}/${target_name}" + RUNCOMMAND ${GPI2_GASPI_RUN} + CLEANUP ${CLEANUP_TEST_NAME} + TIMEOUT ${ARG_TIMEOUT} + SLEEP ${SLEEP_TIME_AFTER_TEST}) + endforeach() +endfunction() +
+function (tarantella_compile_and_generate_test) + set (one_value_options NAME DESCRIPTION TIMEOUT) + set (multi_value_options SOURCES LIBRARIES INCLUDE_DIRECTORIES + SYSTEM_INCLUDE_DIRECTORIES ARGS COMPILE_FLAGS + LABELS) + set (required_options NAME SOURCES) + _parse_arguments (ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + _default_if_unset (ARG_TIMEOUT 10) + + set (target_name ${ARG_NAME}.test) + compile_tarantella_test(${ARGN} + NAME ${target_name}) + add_test (NAME ${ARG_NAME} + COMMAND $<TARGET_FILE:${target_name}> ${ARG_ARGS}) + + # set labels if specified + if (ARG_LABELS) + set_property(TEST ${ARG_NAME} PROPERTY LABELS ${ARG_LABELS}) + endif() + + # set timeout if specified + if (ARG_TIMEOUT) + set_tests_properties(${ARG_NAME} PROPERTIES TIMEOUT ${ARG_TIMEOUT}) + endif() +endfunction() +
+function (tarantella_generate_python_gpi_test) + set (one_value_options NAME TEST_FILE DESCRIPTION TIMEOUT) + set (multi_value_options LOCALRANKS_LIST LABELS ARGS) + set (required_options NAME TEST_FILE LOCALRANKS_LIST) + _parse_arguments (ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + set(CLEANUP_TEST_NAME gpi_cleanup) + _default_if_unset (ARG_TIMEOUT 600) + _default_if_unset (ARG_LABELS "Python") + + list(APPEND ARG_LABELS "Python") + list(REMOVE_DUPLICATES ARG_LABELS) + + # wrap call to the test executable in a script that exports the current environment + # the script can then be executed within a `gaspi_run` call + set(script_name run_${ARG_NAME}.sh) + set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name}) + tarantella_gen_test_script(NAME ${script_name} + SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR} + TEST_FILE ${ARG_TEST_FILE} + IS_PYTHON_TEST) + + message(STATUS "Test: Generating gaspi_run tests for ${ARG_NAME} with ${ARG_LOCALRANKS_LIST} ranks") + foreach(nlocalranks ${ARG_LOCALRANKS_LIST}) + tarantella_add_gpi_test (NAME ${ARG_NAME} + NRANKS ${nlocalranks} + TARGET_FILE ${script_path} + TEST_FILE "${ARG_TEST_FILE}" + RUNCOMMAND ${GPI2_GASPI_RUN} + TIMEOUT ${ARG_TIMEOUT} + CLEANUP ${CLEANUP_TEST_NAME} + SLEEP ${SLEEP_TIME_AFTER_TEST} + LABELS ${ARG_LABELS}) + endforeach() +endfunction() +
+function (tarantella_generate_python_test) + set (one_value_options NAME TEST_FILE DESCRIPTION TIMEOUT) + set (multi_value_options LABELS ARGS) + set (required_options NAME TEST_FILE) + _parse_arguments (ARG "${options}" "${one_value_options}" + "${multi_value_options}" "${required_options}" ${ARGN}) + set(CLEANUP_TEST_NAME gpi_cleanup) + _default_if_unset (ARG_TIMEOUT 600) + _default_if_unset (ARG_LABELS "Python") + + list(APPEND ARG_LABELS "Python") + list(REMOVE_DUPLICATES ARG_LABELS) + + # wrap call to the test executable in a script that exports the current environment + # the script can then be executed within a `gaspi_run` call + set(script_name run_${ARG_NAME}.sh) + set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name}) + tarantella_gen_test_script(NAME ${script_name} +
SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR} + TEST_FILE ${ARG_TEST_FILE} + IS_PYTHON_TEST) + + # create gaspi_run test + add_test(NAME ${ARG_NAME} + WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND "${CMAKE_COMMAND}" + -DRUNCOMMAND=bash + -DRUNCOMMAND_ARGS=" " + -DTEST_EXECUTABLE="${script_path}" + -DTEST_DIR="${CMAKE_BINARY_DIR}" + -DSLEEP="1" + -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake" + ) + + # set labels if specified + if (ARG_LABELS) + set_property(TEST ${ARG_NAME} PROPERTY LABELS ${ARG_LABELS}) + endif() + + # set cleanup fixture script if specified + if (ARG_CLEANUP) + set_tests_properties(${ARG_NAME} PROPERTIES FIXTURES_REQUIRED ${ARG_CLEANUP}) + endif() + + # set timeout if specified + if (ARG_TIMEOUT) + set_tests_properties(${ARG_NAME} PROPERTIES TIMEOUT ${ARG_TIMEOUT}) + endif() + + message(STATUS "Test: Generating test ${ARG_NAME}") +endfunction() diff --git a/cmake/cleanup.sh b/cmake/cleanup.sh new file mode 100644 index 00000000..2562bf59 --- /dev/null +++ b/cmake/cleanup.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +procs=`ps aux | grep --regexp="\(py\)\?test" | grep -v ctest | grep -v grep` +if [ -n "$procs" ] ;then + ps aux | grep --regexp="\(py\)\?test" | grep -v ctest | grep -v grep | awk '{print $2}' | xargs kill 2>&1 > /dev/null +fi diff --git a/cmake/parse_arguments.cmake b/cmake/parse_arguments.cmake new file mode 100644 index 00000000..2d4290d8 --- /dev/null +++ b/cmake/parse_arguments.cmake @@ -0,0 +1,27 @@ +# equivalent to CMakeParseArguments except that parse_arguments +# * forbids UNPARSED_ARGUMENTS but requires to explicitly use +# parse_arguments_with_unknown +# * allows to specify required arguments + +include (CMakeParseArguments) + +macro (_parse_arguments _prefix _options _one_value_options _multi_value_options _required_options) + _parse_arguments_with_unknown ("${_prefix}" "${_options}" "${_one_value_options}" "${_multi_value_options}" "${_required_options}" ${ARGN}) + + if (${_prefix}_UNPARSED_ARGUMENTS) + list (LENGTH ${_prefix}_UNPARSED_ARGUMENTS _unparsed_length) + if (NOT _unparsed_length EQUAL 0) + message (FATAL_ERROR "unknown arguments: ${${_prefix}_UNPARSED_ARGUMENTS}") + endif() + endif() +endmacro() + +macro (_parse_arguments_with_unknown _prefix _options _one_value_options _multi_value_options _required_options) + cmake_parse_arguments ("${_prefix}" "${_options}" "${_one_value_options}" "${_multi_value_options}" ${ARGN}) + + foreach (required ${_required_options}) + if (NOT ${_prefix}_${required}) + message (FATAL_ERROR "required argument ${required} missing") + endif() + endforeach() +endmacro() diff --git a/cmake/run_test.cmake b/cmake/run_test.cmake new file mode 100644 index 00000000..88ef0e67 --- /dev/null +++ b/cmake/run_test.cmake @@ -0,0 +1,50 @@ +# Kill old processes that may be still running +function (kill_old_processes) + set(one_value_options TEST_DIR TEST_EXECUTABLE) + cmake_parse_arguments(ARG "${options}" "${one_value_options}" + "${multi_value_options}" ${ARGN}) + + set(find_processes_command "ps -ef | grep ${ARG_TEST_DIR} | grep -v grep | grep -v ${ARG_TEST_EXECUTABLE}") + set(kill_command "${find_processes_command} | awk '{print $2}' | xargs -r kill -9") + + execute_process(COMMAND sh -c "echo \"Killing `${find_processes_command} | wc -l` processes\"; ${find_processes_command}") + execute_process(COMMAND sh -c "${kill_command}" + COMMAND_ECHO STDOUT) +endfunction() + +foreach(var TEST_DIR TEST_EXECUTABLE RUNCOMMAND RUNCOMMAND_ARGS SLEEP) + if(NOT DEFINED ${var}) + message(FATAL_ERROR "'${var}' must be defined on the 
command line") + endif() + + separate_arguments(var_value UNIX_COMMAND "${${var}}") + string(LENGTH "${var_value}" var_length) + if (var_length LESS 1) + message(FATAL_ERROR "'${var}' must be defined on the command line and not be empty") + endif() +endforeach() + +separate_arguments(runparams_list UNIX_COMMAND "${RUNCOMMAND_ARGS}") +separate_arguments(all_command_params UNIX_COMMAND + "${runparams_list} ${TEST_EXECUTABLE} ${TEST_ARGS}") +kill_old_processes(TEST_DIR ${TEST_DIR} + TEST_EXECUTABLE ${TEST_EXECUTABLE}) + +# Execute the test-executable +execute_process(COMMAND ${RUNCOMMAND} ${all_command_params} + COMMAND_ECHO STDOUT + RESULT_VARIABLE result) + +# Sleep to ensure all processes are done and kill the remainder +separate_arguments(sleep_time UNIX_COMMAND "${SLEEP}") +execute_process(COMMAND ${CMAKE_COMMAND} -E sleep "${sleep_time}" + COMMAND ${CMAKE_COMMAND} -E echo "Sleep ${sleep_time}") +kill_old_processes(TEST_DIR ${TEST_DIR} + TEST_EXECUTABLE ${TEST_EXECUTABLE}) + +# Check return status +if(result) + message(FATAL_ERROR "Test failed:'${result}'") +endif() + + diff --git a/cmake/version.py.in b/cmake/version.py.in new file mode 100644 index 00000000..863f92f9 --- /dev/null +++ b/cmake/version.py.in @@ -0,0 +1,2 @@ +global tnt_version +tnt_version = "@PROJECT_VERSION@" diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt new file mode 100644 index 00000000..afe5b076 --- /dev/null +++ b/docs/CMakeLists.txt @@ -0,0 +1,17 @@ +set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/source) +set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/) + +if (Sphinx_FOUND) + add_custom_target(docs ALL + COMMAND + Sphinx::Sphinx -b html + -Drelease=${PROJECT_VERSION} + ${SPHINX_SOURCE} ${SPHINX_BUILD} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating documentation with Sphinx") + + install(DIRECTORY ${SPHINX_BUILD} + DESTINATION ${CMAKE_INSTALL_PREFIX}/docs) +else() + message(STATUS "Sphinx not found. Skipping documentation build.") +endif() \ No newline at end of file diff --git a/docs/source/advanced_topics.rst b/docs/source/advanced_topics.rst new file mode 100644 index 00000000..28caa105 --- /dev/null +++ b/docs/source/advanced_topics.rst @@ -0,0 +1,144 @@ +Advanced Topics +=============== + +This guide covers a number of advanced topics, such as +performance, reproducibility and user customization. + + +.. _ranks-label: + +GASPI ranks +^^^^^^^^^^^ + +In order to execute distributed DNN training, Tarantella starts multiple processes +on different devices. These processes will be assigned different IDs by the GASPI +communication library, in order to organize communication and synchronization between +the different devices. These IDs are called *ranks*. Usually, Tarantella abstracts away +the concept of *ranks*, in such a way that Tarantella's user interface is essentially +the same as Keras' user interface. + +However, sometimes it is useful, to execute a specific part of code only on one +or a subgroup of all ranks. In particular, one sometimes wants to execute a code +block on the devices that started ``tarantella``, the so-called *master rank*. + +To access ranks, Tarantella provides the following functions + +* ``tnt.get_rank()`` +* ``tnt.get_size()`` +* ``tnt.get_master_rank()`` +* ``tnt.is_master_rank()`` + +``tnt.get_rank()`` returns the ID of the local rank. +``tnt.get_size()`` returns the total number of ranks. 
+``tnt.get_master_rank()`` and ``tnt.is_master_rank()`` return the ID of the master rank +and a boolean for whether the local rank is the master rank or not, respectively. + +Here is a simple example, when using the master rank can be useful to print notifications +only once to ``stdout``: + +.. code-block:: python + + if tnt.is_master_rank(): + print("Printing from the master rank") + +In the same vein, you might want to use ranks to execute :ref:`callbacks ` for logging +only on one rank: + +.. code-block:: python + + history_callback = tf.keras.callbacks.History() + tnt_model.fit(train_dataset, + callbacks = [history_callback] if tnt.is_master_rank() else []) + + +.. _using-local-batch-sizes-label: + +Using local batch sizes +^^^^^^^^^^^^^^^^^^^^^^^ + +As it has been stated in the :ref:`points to consider `, when using +Tarantella the user always specifies the *global* batch size. This has the advantage that +the optimization process during the training of a DNN, and in particular the loss function do not +depend on the number of devices used during execution. + +However, when the number of devices becomes +very large, the (device-local) micro-batch size might become so small, that DNN kernel implementations +are less efficient, resulting in overall performance degradation. +This is why it is in practice often advisable to scale the global batch size with the number of nodes. +This will often lead to linear speedups in terms of the time to accuracy when increasing +the number of devices used, at least up to some *critical batch size*, cf. [Shallue]_ and [McCandlish]_. +Changing the batch size of the optimizer will however also imply the need to adapt the learning rate +schedule. + +.. todo:: + + Enable when the Tutorial is updated: + For details, cf. for instance the :ref:`ResNet-50 tutorial `. + +If you decide to scale the batch size with the number of nodes, Tarantella provides +two different ways to achieve this easily. The first option is to multiply the local batch size +(for instance passed via a command-line parameter) with the number of devices used, +batch your dataset with it, and call ``fit`` on it: + +.. code-block:: python + + micro_batch_size = args.micro_batch_size + batch_size = tnt.get_size() * micro_batch_size + train_dataset = train_dataset.batch(batch_size) + tnt_model.fit(train_dataset) + +As a second option you can also pass the local batch size directly to the ``tnt_micro_batch_size`` +parameter in fit, and leave your dataset unbatched: + +.. code-block:: python + + micro_batch_size = args.micro_batch_size + tnt_model.fit(train_dataset, + tnt_micro_batch_size = micro_batch_size) + +This parameter is also available in ``evaluate`` and ``predict``. In addition, ``fit`` also supports +setting the validation set micro batch size in a similar way with ``tnt_validation_micro_batch_size``. +For more information, please also read :ref:`using distributed datasets `. + + +.. _tensor-fusion-threshold-label: + +Setting Tensor Fusion threshold +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Tarantella automatically uses :ref:`Tensor Fusion ` with a default +threshold of 32kB. This threshold specifies the minimal size of local buffers in *allreduce* +communication operations used to accumulate partial gradients during *backpropagation*. + +Note that the threshold value implies a trade-off between the potential to utilize network +bandwidth, and the overlap of computation and communication during *backpropagation*. 
The +larger the threshold, the more bandwidth-bound the *allreduce* algorithm will get, but +the less potential there will be to overlap its execution with kernel computations. +Also note that the ideal threshold value will generally depend on the number of nodes used. + +To change the default value, you can pass a threshold value in kB to ``tarantella``: + +.. code-block:: bash + + tarantella --hostfile hostfile --fusion-threshold= -- model.py + + +.. _reproducibility-label: + +Reproducibility +^^^^^^^^^^^^^^^ + +Reproducibility is a very important prerequisite to obtain meaningful results in +scientific computing and research. Unfortunately, using stochastic algorithms, +pseudo random generators and having to deal with the pitfalls of floating-point arithmetics, +it is particularly difficult to achieve reproducibility in Deep Learning research. + +In order to be able to reproduce results obtained with TensorFlow, when running in +a multi-node/multi-device setting with Tarantella, one needs to meet at least +the following requirements: + +* set the random seed with ``tf.random.set_seed(seed)`` +* set the environment variable ``os.environ['TF_CUDNN_DETERMINISTIC']='1'`` +* set the shuffle seeds when using ``tf.data.Dataset`` with ``shuffle(seed=seed)`` and ``list_files(seed=seed)`` +* set the ``deterministic`` parameter to ``True`` in ``Dataset`` transformations such as ``interleave`` and ``map`` +* make sure the number of samples in your datasets equal a multiple of ``batch_size`` diff --git a/docs/source/bug_reports.rst b/docs/source/bug_reports.rst new file mode 100644 index 00000000..20f632ec --- /dev/null +++ b/docs/source/bug_reports.rst @@ -0,0 +1,35 @@ +.. _bug-reports-label: + +Bug Reports +=========== + +To report a bug please open an `issue on GitHub `_. + +When opening an issue, please make sure you include as much +information as possible about the issue. Please consider providing at +least the following points: + + * What version of Tarantella you are using + * What linux distribution you are using (e.g., Linux Ubuntu 20.04) + * What kind of system you are experiencing the issue on (type and + number of nodes, network interconnect, etc.) + * What did you expect to see and what have you seen instead + * What exact steps are needed to reproduce the issue + +.. _feature-requests-label: + +Feature Requests +================ + +For contributions other than modifications to the source code, as for +example suggestions of a feature or enhancement, please open +an `issue on GitHub `_ +with the label ``Feature``. + +When providing a feature request, please consider providing at least +the following information: + + * What is the current behavior of the software and how does the feature improve it + * Who would benefit from the feature + * Is there a relevant reference or academic paper describing the feature + * Are you willing to contribute to and/or maintain the feature diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 00000000..cc2e7e3b --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,72 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'Tarantella' +copyright = '2020 Fraunhofer' +author = 'Peter Labus, Alexandra Carpen-Amarie, Martin Kuehn' + +# The full version, including alpha/beta/rc tags +release = '0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['sphinx.ext.todo'] +try: + import sphinx_rtd_theme + extensions += ['sphinx_rtd_theme'] +except: + pass + +# Display TODOs by setting to True +todo_include_todos = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +# html_theme = 'alabaster' # default +try: + import sphinx_rtd_theme + html_theme = "sphinx_rtd_theme" +except: + pass + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_title = release +html_theme_options = dict() +html_theme_options ['logo_only'] = False +# html_theme_options['display_version']= True +# html_logo = "pics/tnt_logo.png" diff --git a/docs/source/contact.rst b/docs/source/contact.rst new file mode 100644 index 00000000..472374e2 --- /dev/null +++ b/docs/source/contact.rst @@ -0,0 +1,14 @@ +.. _contact-label: + +Contact +======= + +In case you have any feature request, +or want to report a bug please follow +:ref:`these instructions `. + +If you consider contributing to Tarantella, please follow +the instructions :ref:`here `. + +If you have any further questions or comments please email to +support@tarantella.org diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst new file mode 100644 index 00000000..08cdd8ad --- /dev/null +++ b/docs/source/contributing.rst @@ -0,0 +1,20 @@ +.. _contributing-label: + +Contributing +============ + +Thank you for considering to contribute to Tarantella. + +There are many ways to contribute to Tarantella. +This includes sharing DNN models distributed through Tarantella, +providing suggestions on improving the documentation, +as well as contributing with changes to the +`Tarantella code base `_. +Even by simply providing suggestions on how we can +:ref:`improve Tarantella ` +and help spreading the word about it are great ways to contribute +and make Tarantella better software. + +If you want to contribute to Tarantella with changes to its code, +please open a `pull request `_ +on GitHub. 
diff --git a/docs/source/data_parallel.rst b/docs/source/data_parallel.rst new file mode 100644 index 00000000..0a914d47 --- /dev/null +++ b/docs/source/data_parallel.rst @@ -0,0 +1,195 @@ +Distributed Data Parallel Training +================================== + +The following section explains the parallelization strategy Tarantella uses to +provide distributed training. A full understanding thereof is, however, not required +to be able to use the software. Please note the :ref:`points to consider ` +to achieve best performance and reproducibility. + +The general idea +---------------- + +In order to parallelize the training of DNNs, different, complementary strategies are available. +The conceptually simplest and most efficient one is called *data parallelism*. This strategy +is already in use when deploying batched optimizers, such as stochastic gradient descent (SGD) +or ADAM. In this case, input samples are grouped together in so-called mini-batches and +are processed in parallel. + +Distribution of mini-batches +---------------------------- + +Tarantella extends this scheme by splitting each mini-batch into a number of micro-batches, +which are then executed on different devices (e.g., GPUs). +In order to do this, the DNN is replicated on each device, +which then processes part of the data independently of the other devices. +During the *backpropagation* pass, partial results need to be accumulated via a so-called +`allreduce `_ +collective operation. + +Overlapping communication with computation +------------------------------------------ + +Tarantella implements this communication scheme using the +`Global Address Space Programming Interface (GASPI) `_. +This allows in particular to overlap the communication needed to execute *allreduce* operations +with the computation done in the *backpropagation* part of the DNN training. +This is done by starting *allreduce* operations as soon as the required local incoming gradients are +available, while continuing with *backpropagation* calculations at the same time. +The final, accumulated gradients are only expected once the entire *backpropagation* is completed. +This drastically mitigates the communication overhead introduced by the need to synchronize +the different devices, and leads to higher scalability. + +.. _tensor-fusion-label: + +Tensor Fusion +------------- + +The granularity at which Tarantella executes *allreduce* operations can be varied from +one *allreduce* per layer (finest granularity) to one *allreduce* per iteration (coarsest granularity). +Using coarser granularities, i.e., *fusing* gradient tensors, +can lead to better bandwidth utilization, thus potentially increasing performance. +*Tensor Fusion* is set up before the first iteration of training and incurs no additional communication overhead. +Tarantella enables *Tensor Fusion* by default, but its granularity can be adjusted by the user, +cf. :ref:`here `. + +Model initialization and loading +-------------------------------- + +In order to guarantee that all devices have the same copy of the DNN when training is initiated, +the model needs to be communicated from one device to all the others. +This is done in Tarantella via the use of a so-called +`broadcast `_ operation. +This scheme applies both when the weights of a DNN are initialized randomly, +or loaded from a checkpoint. +As Tarantella provides this functionality automatically, +the user does not have to take care of it. + +.. 
_points-to-consider-label: + +Distributed Datasets +===================== + +In order to process micro-batches independently on each device and to obtain the same results +as in serial execution, the input data of each mini-batch has to be split and distributed +among all devices. + +Tarantella automatically takes care of this through the use of distributed datasets. +The user simply provides Tarantella with a ``tf.data.Dataset`` that is batched +with the mini-batch size. Tarantella will then automatically distribute the input data +by sharding the mini-batch into individual micro-batches. Sharding is done at the level +of samples (as opposed to e.g., files) to ensure :ref:`reproducibility ` +of serial results. + +To guarantee reproducibility, it is also important that shuffling of samples is done +in the same way on all devices. Tarantella does this using either the ``seed`` provided +by the user, or a specific default seed. Please refer to the +:ref:`Quick Start ` +for more details. + +Points to Consider +================== + +.. _global-vs-local-batch-size-label: + +Global versus local batch size +------------------------------ + +As explained above, when using data parallelism, there exists a *mini-batch size* +(in the following also called global batch size or simply batch size) +as well as a *micro-batch size* (also called local batch size). +The former represents the number of samples that +is averaged over in the loss function of the optimizer, and is equivalent to +the (mini-)batch size used in non-distributed training. The latter is the number +of samples that is processed locally by each of the devices per iteration. + +.. note:: + + In Tarantella, the user always specifies the **global batch size**. + +Using a strictly synchronous optimization scheme, and by carefully handling the data distribution, +**Tarantella guarantees the reproducibility of DNN training results independently of the number of +devices used**, as long as all hyperparameters (such as global batch size and learning rate) +are kept constant. [#footnote_random_seeds]_ + +However, to achieve best performance for certain DNN operators (`Conv2D`, `Dense`, etc.) +it is often advisable to *keep the local batch size constant*, and scale the global +batch size with the number of devices used. This, in turn, will force you to +adjust other hyperparameters, such as the learning rate, in order to converge +to a comparable test accuracy, as observed for instance in [Shallue]_. + +In practice, the use of a learning rate schedule with initial *warm up* and +a *linear learning rate scaling* [Goyal]_, as it is described +:ref:`here `, often suffices. + +.. tip:: + + For best performance, scale the batch size with the number of devices used, + and :ref:`adapt the learning rate schedule `. + +Batch normalization layers +-------------------------- + +The issue of global versus local batch size particularly affects the layers +that calculate (and learn) statistics over entire batches. +A well-known example of this type of layer is +`batch normalization `_. + +.. caution:: + + Tarantella always calculates batch statistics over **local batches**. + +As a consequence, the training results for DNNs with batch-normalization layers +**will not be identical when changing the number of devices, even if +the global batch size stays the same.** +At the moment, this can be circumvented by using normalization layers that +do *not* average over entire batches, such as instance normalization +[Ulyanov]_. 
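
For illustration only, a model could use such a layer in place of ``BatchNormalization``.
In the following sketch, the ``InstanceNormalization`` layer is assumed to come from the
separately installed TensorFlow Addons package, which is not a Tarantella dependency:

.. code-block:: python

    import tensorflow as tf
    import tensorflow_addons as tfa  # assumed extra dependency, not shipped with Tarantella

    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(64, 3, activation='relu', input_shape=(32, 32, 3)),
        # per-sample statistics: the result does not depend on how the
        # mini-batch is sharded into micro-batches across devices
        tfa.layers.InstanceNormalization(),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
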
+ +Averaging over *local* batches instead of global batches should in practice +have only minor influence on the quality of the final test accuracy. +Note however, the extreme case of very small *local* batch sizes. + +.. caution:: + + Avoid using ``BatchNormalization`` layers when the global batch size + divided by the number of devices used is *smaller than 16*. + +In such cases, the local batches that are used to collect statistics are +too small to obtain meaningful results. This will likely reduce the +benefits of batch normalization, cf. for instance [Yang]_ and [Uppal]_. +In this case, please consider increasing the global batch size, +or reducing the number of devices used. + +Managing individual devices +--------------------------- + +Although Tarantella's user interface abstracts away most of the details of +parallel programming, it is sometimes useful to be able to control +Python code execution at device level. This can be achieved using the +`GASPI `_ concept +of a ``rank``. Details on how to do this can be found in the +:ref:`advanced topics `. + +.. rubric:: References + +.. [Shallue] Shallue, Christopher J., et al. "Measuring the effects of data parallelism on neural network training." arXiv preprint arXiv:1811.03600 (2018). + +.. [Ulyanov] Ulyanov, Dmitry, Andrea Vedaldi, and Victor Lempitsky. "Instance normalization: The missing ingredient for fast stylization." arXiv preprint arXiv:1607.08022 (2016). + +.. [Goyal] Goyal, Priya, et al. "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour." arXiv preprint arXiv:1706.02677 (2017). + +.. [Yang] Yang, Greg, et al. "A mean field theory of batch normalization." arXiv preprint arXiv:1902.08129 (2019). + +.. [Uppal] https://towardsdatascience.com/curse-of-batch-normalization-8e6dd20bc304 + +.. [McCandlish] McCandlish, Sam, et al. "An empirical model of large-batch training." arXiv preprint arXiv:1812.06162 (2018). + +.. [He] He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016. + +.. [Vaswani] Vaswani, Ashish, et al. "Attention is all you need." Advances in neural information processing systems. 2017. + +.. rubric:: Footnotes + +.. [#footnote_random_seeds] This is strictly true, only when all randomness in TensorFlow is + seeded or switched off, as explained in the :ref:`advanced topics ` + diff --git a/docs/source/faq.rst b/docs/source/faq.rst new file mode 100644 index 00000000..b2ba811c --- /dev/null +++ b/docs/source/faq.rst @@ -0,0 +1,80 @@ +.. _faq-label: + +Frequently Asked Questions (FAQ) +================================ + +This is a list of frequently asked questions about Tarantella. +Please feel free to :ref:`suggest new ones `! + +.. admonition:: Question + + How can I ssh to ``localhost`` without password? + +In order to run Tarantella programs, you will need to be able to ssh to ``localhost`` without password. +In order to do that generate ``ssh`` keys first: + +.. code-block:: bash + + cd ~/.ssh + ssh-keygen + +Make sure not to overwrite existing keys. +When asked for a passphrase, ``Enter passphrase (empty for no passphrase):``, simply leave empty +and return with enter. +Also take specific care to set correct user rights on all files in ``.ssh``, +cf. for instance `here `__. +Next, append the public key to the ``authorized_keys`` file: + +.. code-block:: bash + + cat id_rsa.pub >> authorized_keys + +Now, install and start an ssh server, e.g., openssh-server on Fedora. 
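+
+For example, on Fedora this could be done as follows (package and service names are
+assumptions and may differ on other distributions):
+
+.. code-block:: bash
+
+    # install and start the OpenSSH server
+    sudo dnf install openssh-server
+    sudo systemctl enable --now sshd
+
+    # verify that password-less login works
+    ssh localhost hostname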
+More details can be found for instance +`here `__. + +.. admonition:: Question + + I get an execution error ``GPI library initialization incorrect environment vars`` when + trying to run my script. What shall I do? + +Most likely you are running your program with ``python my_script.py`` or ``./my_script.py``. +Please make sure to execute your code with ``tarantella my_script.py`` instead. + +.. admonition:: Question + + I get an execution error ``GPI library initialization general error``. What shall I do? + +This error occurs when the GASPI library tries to connect to a previously used socket, that is not yet released. +Try to re-run your code after a short while so that the port becomes available again. + +.. admonition:: Question + + The execution seems to stall. What shall I do? + +Please kill any processes that might be still running from a previous (aborted) call to ``tarantella``. + +.. admonition:: Question + + | When trying to build Tarantella, CMake cannot find pybind11: + | ``Could not find a package configuration file provided by "pybind11" with any`` + | ``of the following names: [...]`` + | What shall I do? + +This error occurs when pybind11 is installed using pip. +Please instead use conda, as recommended in the :ref:`installation guide `. + +.. admonition:: Question + + When trying to build Tarantella, CMake does not detect the Python interpreter from the + active conda environment. What shall I do? + +You will need to manually add the path to the conda environment's ``bin`` directory to your ``PATH``. +You will also need to specify the path to the python library on the command line when configuring Tarantella: + +.. code-block:: bash + + PATH_TO_CONDA_ENV=/path/to/conda/env + export PATH=${PATH_TO_CONDA_ENV}/bin:${PATH} + cmake -DPYTHON_EXECUTABLE=${PATH_TO_CONDA_ENV}/bin/python \ + -DPYTHON_LIBRARY=${PATH_TO_CONDA_ENV}/lib ../ diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 00000000..8a3a7e37 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,46 @@ +.. image:: pics/tnt_logo_text.png + :width: 750 + :align: center + +| +`Tarantella `_ +is an open-source, distributed Deep Learning framework built on top of TensorFlow 2, +providing scalable Deep Neural Network training on CPU and GPU compute clusters. + +Tarantella is easy-to-use, allows to re-use existing TensorFlow 2/Keras models, +and does not require any knowledge of parallel computing. + +.. image:: pics/tnt_run.gif + :width: 750 + :align: center + +| + +Table of contents +================= + +.. toctree:: + :maxdepth: 2 + :caption: Overview + + why_tarantella + data_parallel + +.. toctree:: + :maxdepth: 2 + :caption: Getting started + + installation + quick_start + tutorials + advanced_topics + faq + +.. toctree:: + :maxdepth: 2 + :caption: Community + + bug_reports + contributing + contact + license diff --git a/docs/source/installation.rst b/docs/source/installation.rst new file mode 100644 index 00000000..6bb6f809 --- /dev/null +++ b/docs/source/installation.rst @@ -0,0 +1,197 @@ +.. _installation-label: + +Installation +============ + +Tarantella needs to be built `from source `_. +Since Tarantella is built on top of `TensorFlow 2 `_, +you will require a recent version of it. Additionally, you will need an installation of +the open-source communication library `GPI-2 `_, which Tarantella uses +to communicate between processes. +Lastly, you will need `pybind11 `_, which is required +for Python and C++ inter-communication. 
+
+
+In the following we will look at the required steps in detail.
+
+Installing dependencies
+-----------------------
+
+Compiler and build system
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Tarantella can be built using a recent `gcc `_
+compiler (from version ``7.4.0``).
+You will also need the build tool `CMake `_ (from version ``3.8``).
+
+Installing GPI-2
+^^^^^^^^^^^^^^^^
+
+Next, you will need to download, compile and install the GPI-2 library.
+The currently supported version is ``v1.4.0``, which needs to be built with
+position-independent flags (``-fPIC``).
+
+To download the required version, clone the
+`git repository `_
+and check out the correct ``tag``:
+
+.. code-block:: bash
+
+    git clone https://github.com/cc-hpc-itwm/GPI-2.git
+    cd GPI-2
+    git fetch --tags
+    git checkout -b v1.4.0 v1.4.0
+
+Now, use `autotools `_ to configure and compile the code:
+
+.. code-block:: bash
+
+    ./autogen.sh
+    export GPI2_INSTALLATION_PATH=/your/installation/path
+    CFLAGS="-fPIC" CPPFLAGS="-fPIC" ./configure --with-ethernet --prefix=${GPI2_INSTALLATION_PATH}
+    make
+
+where ``${GPI2_INSTALLATION_PATH}`` needs to be replaced with the path where you want to install
+GPI-2. Note the ``--with-ethernet`` option, which will use standard TCP sockets for communication.
+This is the correct option for laptops and workstations.
+
+In case you want to use Infiniband, replace the above option with ``--with-infiniband``.
+Now you are ready to install GPI-2 with
+
+.. code-block:: bash
+
+    make install
+    export PATH=${GPI2_INSTALLATION_PATH}/bin:$PATH
+    export LD_LIBRARY_PATH=${GPI2_INSTALLATION_PATH}/lib64:$LD_LIBRARY_PATH
+
+where the last two commands make the library visible to your system.
+If required, GPI-2 can be removed from the target directory by using ``make uninstall``.
+
+Installing TensorFlow 2
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Next, you will need to install TensorFlow 2.
+Tarantella supports TensorFlow versions ``2.0`` to ``2.2``.
+Any of these versions can be installed in a conda environment using pip,
+as recommended on the `TensorFlow website `_.
+
+In order to do that, first install `conda `_ on your system.
+Then, create and activate an environment for Tarantella:
+
+.. code-block:: bash
+
+    conda create -n tarantella
+    conda activate tarantella
+
+Now, you can install the latest supported TensorFlow version with
+
+.. code-block:: bash
+
+    conda install python=3.7
+    pip install --upgrade tensorflow==2.2
+
+.. _installation-pybind11-label:
+
+Installing pybind11
+^^^^^^^^^^^^^^^^^^^
+
+The last dependency you will need to install is
+`pybind11 `__,
+which is available through pip and conda.
+We recommend installing pybind11 via conda:
+
+.. code-block:: bash
+
+    conda install pybind11 -c conda-forge
+
+SSH key-based authentication
+----------------------------
+
+In order to use Tarantella on a cluster, make sure you can ssh between nodes
+without a password. For details, refer to the :ref:`FAQ section `.
+In particular, to test Tarantella on your local machine, make sure
+you can ssh to ``localhost`` without a password.
+
+Building Tarantella from source
+-------------------------------
+
+With all dependencies installed, we can now download, configure and compile Tarantella.
+To download the source code, simply clone the
+`GitHub repository `__:
+
+.. code-block:: bash
+
+    git clone https://github.com/cc-hpc-itwm/tarantella.git
+
+Next, we need to configure the build system using CMake.
+For a standard out-of-source build, we create a separate ``build`` folder and run ``cmake``
+in it:
+
+..
code-block:: bash + + cd tarantella + mkdir build && cd build + export TARANTELLA_INSTALLATION_PATH=/your/installation/path + cmake -DCMAKE_INSTALL_PREFIX=${TARANTELLA_INSTALLATION_PATH} .. + +Now, we can compile and install Tarantella to ``TARANTELLA_INSTALLATION_PATH``: + +.. code-block:: bash + + make + make install + export PATH=${TARANTELLA_INSTALLATION_PATH}/bin:${PATH} + +[Optional] Building and running tests +------------------------------------- + +In order to build Tarantella with tests, you will also need to install +`Boost `_ +(for C++ tests), and `pytest `_ (for Python tests). + +To install boost with the required `devel`-packages, under Ubuntu you can use + +.. code-block:: bash + + sudo apt install libboost-all-dev + +while in Fedora you can use + +.. code-block:: bash + + sudo dnf install boost boost-devel + +To install pytest you can use pip: + +.. code-block:: bash + + pip install -U pytest + +After having installed these libraries, make sure to configure Tarantella with testing switched on: + +.. code-block:: bash + + cmake -DENABLE_TESTING=ON .. + +Now you can compile Tarantella and run its tests in the ``build`` directory. + +.. code-block:: bash + + make + ctest + +[Optional] Building documentation +--------------------------------- + +If you would like to build `the documentation `_ +locally, run the following ``cmake`` command + +.. code-block:: bash + + cmake -DCMAKE_INSTALL_PREFIX=${TARANTELLA_INSTALLATION_PATH} -DBUILD_DOCS=ON .. + +before compiling. +This requires you to have `Sphinx `_ installed: + +.. code-block:: bash + + pip install -U sphinx diff --git a/docs/source/license.rst b/docs/source/license.rst new file mode 100644 index 00000000..b46bf268 --- /dev/null +++ b/docs/source/license.rst @@ -0,0 +1,5 @@ +License +======= + +.. literalinclude:: ../../LICENSE + :language: text diff --git a/docs/source/model.py b/docs/source/model.py new file mode 100644 index 00000000..2845f141 --- /dev/null +++ b/docs/source/model.py @@ -0,0 +1,89 @@ +import argparse +import tensorflow as tf +from tensorflow import keras + +import tarantella as tnt +tnt.init() + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-bs", "--batch_size", type=int, default=64) + parser.add_argument("-e", "--number_epochs", type=int, default=1) + parser.add_argument("-lr", "--learning_rate", type=float, default=0.01) + parser.add_argument("-train", "--train_size", type=int, default=48000) + parser.add_argument("-val", "--val_size", type=int, default=6400) + parser.add_argument("-test", "--test_size", type=int, default=6400) + args = parser.parse_args() + return args + +def mnist_as_np_arrays(training_samples, validation_samples, test_samples): + mnist_train_size = 60000 + mnist_test_size = 10000 + assert(training_samples + validation_samples <= mnist_train_size) + assert(test_samples <= mnist_test_size) + + # load given number of samples + (x_train_all, y_train_all), (x_test_all, y_test_all) = \ + keras.datasets.mnist.load_data() + x_train = x_train_all[:training_samples] + y_train = y_train_all[:training_samples] + x_val = x_train_all[training_samples:training_samples+validation_samples] + y_val = y_train_all[training_samples:training_samples+validation_samples] + x_test = x_test_all[:test_samples] + y_test = y_test_all[:test_samples] + + # normalization and reshape + x_train = x_train.reshape(training_samples,28,28,1).astype('float32') / 255. + x_val = x_val.reshape(validation_samples,28,28,1).astype('float32') / 255. 
+ x_test = x_test.reshape(test_samples,28,28,1).astype('float32') / 255. + y_train = y_train.astype('float32') + y_val = y_val.astype('float32') + y_test = y_test.astype('float32') + + return (x_train, y_train), (x_val, y_val), (x_test, y_test) + +def lenet5_model_generator(): + inputs = keras.Input(shape=(28,28,1,), name='input') + x = keras.layers.Conv2D(20, 5, padding="same", activation='relu')(inputs) + x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = keras.layers.Conv2D(50, 5, padding="same", activation='relu')(x) + x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = keras.layers.Flatten()(x) + x = keras.layers.Dense(500, activation='relu')(x) + outputs = keras.layers.Dense(10, activation='softmax')(x) + return keras.Model(inputs=inputs, outputs=outputs) + +args = parse_args() + +# Create Tarantella model +model = tnt.Model(lenet5_model_generator()) + +# Compile Tarantella model (as with Keras) +model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate), + loss = keras.losses.SparseCategoricalCrossentropy(), + metrics = [keras.metrics.SparseCategoricalAccuracy()]) + +# Load MNIST dataset (as with Keras) +shuffle_seed = 42 +(x_train, y_train), (x_val, y_val), (x_test, y_test) = \ + mnist_as_np_arrays(args.train_size, args.val_size, args.test_size) + +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed) +train_dataset = train_dataset.batch(args.batch_size) +train_dataset = train_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE) + +val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)) +val_dataset = val_dataset.batch(args.batch_size) + +test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) +test_dataset = test_dataset.batch(args.batch_size) + +# Train Tarantella model (as with Keras) +model.fit(train_dataset, + validation_data = val_dataset, + epochs = args.number_epochs, + verbose = 1) + +# Evaluate Tarantella model (as with Keras) +model.evaluate(test_dataset, verbose = 1) diff --git a/docs/source/pics/tnt_logo.png b/docs/source/pics/tnt_logo.png new file mode 100644 index 00000000..87f475be Binary files /dev/null and b/docs/source/pics/tnt_logo.png differ diff --git a/docs/source/pics/tnt_logo_text.png b/docs/source/pics/tnt_logo_text.png new file mode 100644 index 00000000..4a829d20 Binary files /dev/null and b/docs/source/pics/tnt_logo_text.png differ diff --git a/docs/source/pics/tnt_run.gif b/docs/source/pics/tnt_run.gif new file mode 100644 index 00000000..cc39935a Binary files /dev/null and b/docs/source/pics/tnt_run.gif differ diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst new file mode 100644 index 00000000..36225ff7 --- /dev/null +++ b/docs/source/quick_start.rst @@ -0,0 +1,455 @@ +.. _quick-start-label: + +Quick Start +=========== + +This section explains how to get started using Tarantella to distributedly +train an existing TensorFlow 2/Keras model. +First, we will examine what changes have to be made to your code, before we will look into +the execution of your script with ``tarantella`` on the command line. +Finally, we will present the features Tarantella currently supports and +what important points need to be taken into account when using Tarantella. + +Code example: LeNet-5 on MNIST +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +After having :ref:`build and installed ` Tarantella +we are ready to add distributed training support to an existing TensorFlow 2/Keras model. 
+
+We will first illustrate all the necessary steps, using the well-known example of
+**LeNet-5** on the **MNIST** dataset. Although this is not necessarily a good use case
+to take full advantage of Tarantella's capabilities, it will allow you to simply
+copy-paste the code snippets and try them out, even on your laptop.
+
+**Let's get started!**
+
+.. literalinclude:: quick_start_model.py
+    :language: Python
+    :linenos:
+    :emphasize-lines: 3,9,13
+
+As you can see from the marked lines in the code snippet,
+you only need to add *3 lines of code* to train LeNet-5 distributedly using Tarantella!
+Let us go through the code in some more detail, in order to understand what is going on.
+
+First, we need to import the Tarantella library:
+
+.. code-block:: Python
+
+    import tarantella as tnt
+
+Having done that, we need to initialize the library (which will set up the communication infrastructure):
+
+.. code-block:: Python
+
+    tnt.init()
+
+Note that this should be done before executing any other code. Next, we need to wrap the
+``keras.Model`` object, generated by ``lenet5_model_generator()``, into a ``tnt.Model`` object:
+
+.. code-block:: Python
+
+    model = tnt.Model(lenet5_model_generator())
+
+**That's it!**
+
+All the necessary steps to distribute training and datasets will now automatically be handled by Tarantella.
+In particular, we still run ``model.compile`` on the new ``model`` to generate a compute graph,
+just as we would have done with a typical Keras model.
+
+Next, we load the MNIST data for training and testing, and
+create ``Dataset`` s from it. Note that we ``batch`` the dataset for training.
+This will guarantee that Tarantella is able to distribute the data later on in the correct way.
+Also note that the ``batch_size`` used here is the same as for the original model,
+that is the *global* batch size. For details concerning local and global batch sizes have a look
+:ref:`here `.
+
+Now we are able to train our ``model`` using ``model.fit``, in the same familiar
+way used by the standard Keras interface. Note, however, that Tarantella is taking care of proper
+distribution of the ``train_dataset`` in the background. All the possibilities of how to
+feed datasets to Tarantella are explained in more detail below.
+Lastly, we can evaluate the final accuracy of our ``model`` on the ``test_dataset`` using
+``model.evaluate``.
+
+To test and run ``tarantella`` in the next section, you can find a full version of the above example
+`here `__.
+
+Executing your model with ``tarantella``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Next, let's execute our model distributedly using ``tarantella`` on the command line.
+The simplest way to do that is by passing the Python script of the model to ``tarantella``:
+
+.. code-block:: bash
+
+    tarantella -- model.py
+
+This will execute our model distributedly on a single node, using all the available GPUs.
+In case no GPUs can be found, ``tarantella`` will execute in serial mode on the CPU,
+and a ``WARNING`` message will be issued. In case you have GPUs available, but
+want to execute ``tarantella`` on CPUs nonetheless, you can specify the ``--no-gpu`` option.
+
+.. code-block:: bash
+
+    tarantella --no-gpu -- model.py
+
+We can also set command line parameters for the python script ``model.py``, which have to
+follow the name of the script:
+
+.. code-block:: bash
+
+    tarantella --no-gpu -- model.py --batch_size=64 --learning_rate=0.01
+
+On a single node, we can also explicitly specify the number of TensorFlow instances
+we want to use. This is done with the ``-n`` option:
+
+.. code-block:: bash
+
+    tarantella -n 4 -- model.py --batch_size=64
+
+Here, ``tarantella`` would try to execute distributedly on 4 GPUs.
+If there are not enough GPUs available, ``tarantella`` will print a ``WARNING``
+and run 4 instances of TensorFlow on the CPU instead.
+If there are no GPUs installed or the ``--no-gpu`` option is used,
+``tarantella`` will not print a ``WARNING``.
+
+Next, let's run ``tarantella`` on multiple nodes. In order to do this,
+we need to provide ``tarantella`` with a ``hostfile`` that contains
+the ``hostname`` s of the nodes that we want to use:
+
+.. code-block:: bash
+
+    $ cat hostfile
+    name_of_node_1
+    name_of_node_2
+
+With this ``hostfile`` we can run ``tarantella`` on multiple nodes:
+
+.. code-block:: bash
+
+    tarantella --hostfile hostfile -- model.py
+
+In this case, ``tarantella`` uses *all* GPUs it can find.
+If no GPUs are available, ``tarantella`` will start *one* TensorFlow instance
+per node on the CPUs, and will issue a ``WARNING`` message.
+Again, this can be disabled by explicitly using the ``--no-gpu``
+option.
+
+As before, you can specify the number of GPUs/CPUs used per node
+explicitly with the option ``--n-per-node=``:
+
+.. code-block:: bash
+
+    tarantella --hostfile hostfile --n-per-node=4 --no-gpu -- model.py --batch_size=64
+
+In this example, ``tarantella`` would execute 4 instances of TensorFlow on the CPUs
+of each node specified in ``hostfile``.
+
+.. caution::
+
+    ``tarantella`` requires all the names in the ``hostfile`` to be **unique**,
+    and all nodes to be **homogeneous** (number and type of CPUs and GPUs).
+
+In addition, ``tarantella`` can be run with different levels of logging output.
+The available log levels are ``INFO``, ``WARNING``, ``DEBUG`` and ``ERROR``,
+and can be set with ``--log-level``:
+
+.. code-block:: bash
+
+    tarantella --hostfile hostfile --log-level=INFO -- model.py
+
+By default, ``tarantella`` will log on the :ref:`master rank ` only.
+This can be changed by using the ``--log-on-all-devices`` option, which will print
+log messages for each :ref:`rank ` individually.
+
+Similarly, by default ``tarantella`` will print outputs from functions like ``fit``,
+``evaluate`` and ``predict``, as well as callbacks, only on the master rank.
+Sometimes, it might be useful to print outputs from all devices (e.g., for debugging),
+which can be switched on with the ``--output-on-all-devices`` option.
+
+``tarantella`` uses GPI-2's ``gaspi_run`` internally, taking care of ``export`` ing
+environment variables, and generating an execution script from the user inputs.
+Details of this process can be monitored using the ``--dry-run`` option.
+
+Lastly, you can overwrite the *Tensor Fusion* threshold ``tarantella`` uses
+with ``--fusion-threshold FUSION_THRESHOLD_KB``
+(cf. :ref:`here ` and :ref:`here `),
+and set a number of environment variables, most notably
+``TNT_TENSORBOARD_ON_ALL_DEVICES``, as explained
+:ref:`here `.
+
+Save and load Tarantella models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Storing and loading your trained ``tnt.Model`` is very simple.
+
+Tarantella supports all the different ways in which you can load and store a ``keras.Model``
+(for a guide look for instance `here `__).
+In particular, you can: + +* save the whole model (including the architecture, the weights and the state of the optimizer) +* save the model's architecture/configuration only +* save the model's weights only + +Whole-model saving and loading +------------------------------ + +Saving the entire model including the architecture, weights and optimizer can be done via + +.. code-block:: python + + model = ... # get `tnt.Model` + model.save('path/to/location') + +Alternatively, you could use ``tnt.models.save_model('path/to/location')``, which works +on both ``keras.Model`` s and ``tnt.Model`` s. + +You can than load your model back using + +.. code-block:: python + + import tarantella as tnt + model = tnt.models.load_model('path/to/location') + +which will return an instance of ``tnt.Model``. + +.. caution:: + + At the moment, you will need to re-compile your model after loading. + +This is again done with + +.. code-block:: python + + model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate), + loss = keras.losses.SparseCategoricalCrossentropy(), + metrics = [keras.metrics.SparseCategoricalAccuracy()]) + +or similar. + +Architecture saving and loading +------------------------------- + +If you only want to save the configuration (that is the architecture) of your model +(in memory), you can use one of the following functions: + +* ``tnt.Model.get_config`` +* ``tnt.Model.to_json`` +* ``tnt.Model.to_yaml`` + +The architecture without its original weights and optimizer can then be restored +using: + +* ``tnt.models.model_from_config`` / ``tnt.Model.from_config`` +* ``tnt.models.model_from_json`` +* ``tnt.models.model_from_yaml`` + +respectively. +Here is an example: + +.. code-block:: python + + import tarantella as tnt + model = ... # get `tnt.Model` + config = model.get_config() + new_model = tnt.models.model_from_config(config) + +The same can be achieved through cloning: + +.. code-block:: python + + import tarantella as tnt + model = ... # get `tnt.Model` + new_model = tnt.models.clone_model(model) + + +Weights saving and loading +-------------------------- + +Storing and loading the weights of a model to/from memory can be done +using the functions ``tnt.Model.get_weights`` and ``tnt.Model.set_weights``, +respectively. Saving and loading weights to/from disk is done +using the functions ``tnt.Model.save_weights`` and ``tnt.Model.load_weights``, +respectively. + +Here is an example how this can be used to restore a model: + +.. code-block:: python + + import tarantella as tnt + model = ... # get `tnt.Model` + config = model.get_config() + weights = model.get_weights() + + # initialize a new model with original model's weights + new_model = tnt.models.model_from_config(config) + new_model.set_weights(weights) + +.. _checkpointing-via-callbacks-label: + +Checkpointing via callbacks +--------------------------- + +Apart from saving and loading models manually, Tarantella also supports checkpointing +via Keras' ``ModelCheckpoint`` callback, as it is described for instance +`here `__. + +.. code-block:: python + + import tensorflow as tf + import tarantella as tnt + + model = ... 
# get `tnt.Model` + + checkpoint_path = 'path/to/checkpoint/location' + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False, + save_weights_only=False, mode='auto', save_freq='epoch', options=None) + + model.fit(train_dataset, + validation_data = val_dataset, + epochs = 2, + callbacks = [model_checkpoint_callback]) + + +.. note:: + + All saving to the filesystem (including ``tnt.Model.save`` and ``tnt.Model.save_weights``) + by Tarantella will only be done on the master rank. + +This is the default and will yield correct behavior when you are using a distributed filesystem. +If you wish to explicitly save on all devices you can pass ``tnt_save_all_devices = True`` +to ``tnt.Model.save``, ``tnt.Model.save_weights`` and ``tnt.models.save_model``. + + +.. _using-distributed-datasets-label: + +Using distributed datasets +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This section explains what needs to be done in order to use Tarantella's distributed datasets correctly. + +The recommended way in which to provide your dataset to Tarantella is by passing a +*batched* ``tf.data.Dataset`` to ``tnt.Model.fit``. +In order to do this, create a ``Dataset`` and apply the ``batch`` +`transformation `_ +using the (global) batch size to it. However, do not provide a value to ``batch_size`` +in ``tnt.Model.fit``, which would lead to double batching, and thus modified shapes +for the input data. + +Tarantella also supports batched and unbatched ``Dataset`` s in ``tnt.Model.fit`` +when setting the ``tnt_micro_batch_size`` argument. This can be useful to obtain +maximal performance in multi-node execution, as explained +:ref:`here `. Keep in mind however, that Tarantella still expects +the ``Dataset`` to be batched with the global batch size, and that the micro-batch +size has to be consistent with the global batch size. [#footnote_consistent]_ +This is why, it is recommended to use an unbatched ``Dataset`` when setting +a ``tnt_micro_batch_size`` explicitly. + +Tarantella does not support any other way to feed data to ``fit`` at the moment. +In particular, Numpy arrays, TensorFlow tensors and generators are not supported. + +Tarantella's automatic data distribution can be switched off by passing +``tnt_distribute_dataset=False`` in ``tnt.Model.fit``, in which case Tarantella +will issue an ``INFO`` message. +If a validation dataset is passed to ``tnt.Model.fit``, it should also be batched +with the global batch size. You can similarly switch off its automatic +micro-batching mechanism by setting ``tnt_distribute_validation_dataset=False``. + +There are a few important points when using distributed datasets in Tarantella: + +.. note:: + + Batch size must be a multiple of the number of devices used. + +This issue will be fixed in the next release. + +.. note:: + + The last incomplete batch is always dropped. + +We recommend to use ``drop_remainder=True`` when generating a ``Dataset``. +If ``drop_remainder`` is set to ``False``, Tarantella will ignore it +and issue a ``WARNING`` message. This behavior will be fixed in the next release. + +.. note:: + + When using ``shuffle`` without a ``seed``, Tarantella will use a fixed default ``seed``. + +This guarantees that the input data is shuffled the same way on all devices, +when no ``seed`` is given, which is necessary for consistency. +However, when a random ``seed`` is provided by the user, Tarantella will use that one instead. + +.. 
_callbacks-label: + +Callbacks +^^^^^^^^^ + +At the moment, Tarantella fully supports 3 of the +`Keras callbacks `__: + +* ``tf.keras.callbacks.LearningRateScheduler`` +* ``tf.keras.callbacks.ModelCheckpoint`` +* ``tf.keras.callbacks.TensorBoard`` + +The ``LearningRateScheduler`` takes a ``schedule`` which will change the learning rate +on each of the devices used (for detailed explanation, cf. +`here `__ +and +`here `__ +). +If ``verbose=1`` is set, Tarantella will only print on one device by default. +This behavior can be changed by passing ``--output-on-all-devices`` to ``tarantella``. + +``ModelCheckpoint`` can be used to automatically checkpoint the state of the model +during training. For an example look :ref:`here `, +and into the +`Keras documentation `__. + +The ``TensorBoard`` callback can be used to collect training information for visualization +in `TensorBoard `__. By default, Tarantella +will only collect (device local) information on one device. If you want to collect +the local information on all devices use the environment variable ``TNT_TENSORBOARD_ON_ALL_DEVICES``: + +.. code-block:: bash + + TNT_TENSORBOARD_ON_ALL_DEVICES=true tarantella -- model.py + +.. note:: + + At the moment, all of the other Keras callbacks will be executed on all devices with + local information only. + +For instance, the ``BaseLogger`` callback will be executed on each and every rank, +and will log the acculumated metric averages for the local (micro-batch) information. + +Important points +^^^^^^^^^^^^^^^^ + +There is a number of points you should be aware of when using Tarantella. + +.. note:: + + ``tnt.init()`` needs to be called **after** ``import tarantella as tnt``, but **before** + any other statement. + +This will make sure the GPI-2 communication infrastructure is correctly initialized. + +.. note:: + + Tarantella does not support custom training loops. + +Instead of using custom training loops, please use ``Model.fit(...)``. + +.. note:: + + Tarantella supports all + `TensorFlow optimizers `_ + with the exception of ``tf.keras.optimizers.Ftrl``. + +Since the ``Ftrl`` optimizer does not use batches, it is not supported in Tarantella. + + +.. rubric:: Footnotes + +.. [#footnote_consistent] That is, the global batch size must equal the micro batch size times + the number of devices used. diff --git a/docs/source/quick_start_model.py b/docs/source/quick_start_model.py new file mode 100644 index 00000000..7a345bae --- /dev/null +++ b/docs/source/quick_start_model.py @@ -0,0 +1,39 @@ +import tensorflow as tf +from tensorflow import keras +import tarantella as tnt + +# Skip function implementations for brevity +[...] 
+ +# Initialize Tarantella (before doing anything else) +tnt.init() +args = parse_args() + +# Create Tarantella model +model = tnt.Model(lenet5_model_generator()) + +# Compile Tarantella model (as with Keras) +model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate), + loss = keras.losses.SparseCategoricalCrossentropy(), + metrics = [keras.metrics.SparseCategoricalAccuracy()]) + +# Load MNIST dataset (as with Keras) +shuffle_seed = 42 +(x_train, y_train), (x_val, y_val), (x_test, y_test) = \ + mnist_as_np_arrays(args.train_size, args.val_size, args.test_size) + +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed) +train_dataset = train_dataset.batch(args.batch_size) +train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE) + +test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) +test_dataset = test_dataset.batch(args.batch_size) + +# Train Tarantella model (as with Keras) +model.fit(train_dataset, + epochs = args.number_epochs, + verbose = 1) + +# Evaluate Tarantella model (as with Keras) +model.evaluate(test_dataset, verbose = 1) diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst new file mode 100644 index 00000000..c7c9b189 --- /dev/null +++ b/docs/source/tutorials.rst @@ -0,0 +1,304 @@ +Tutorials +========= + +This section delves into more advanced usage of Tarantella with the help of +state-of-the-art models for two widely-used applications in Deep Learning: + +* Image classification: ResNet-50 +* Machine translation: Transformer + +The models shown here are adapted from the +`TensorFlow Model Garden `_. +While the model implementations and hyperparameters are unchanged to preserve +compatibility with the TensorFlow official models, we provide simplified training +schemes that allow for a seemless transition from basic serial training to distributed +data parallelism using Tarantella. + + +Prerequisites +------------- + +The tutorial models can be downloaded from the +`Tnt Models repository `_ + +.. code-block:: bash + + export TNT_MODELS_PATH=/your/installation/path + cd ${TNT_MODELS_PATH} + git clone https://github.com/cc-hpc-itwm/tarantella_models + +To use these models, install the the following dependencies: + +* TensorFlow 2.2.1 +* Tarantella 0.6.0 + +For a step-by-step installation, follow the :ref:`installation-label` guide. +In the following we will assume that TensorFlow was installed in a ``conda`` +environment called ``tarantella``. + +Now we can install the final dependency, +`TensorFlow official Model Garden `__: + +.. code-block:: bash + + conda activate tarantella + pip install tf-models-official==2.2.1 + + +.. _resnet50-label: + +ResNet-50 +--------- + +Deep Residual Networks (ResNets) represented a breakthrough in the field of +computer vision, enabling deeper and more complex deep convolutional networks. +Introduced in [He]_, ResNet50 has become a standard model for image classification +tasks, and has be shown to scale to very large number of nodes in data parallel +training [Goyal]_. + +Run Resnet-50 with Tarantella +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Before running the model, we need to add it to the existing ``PYTHONPATH``. + +.. code-block:: bash + + export PYTHONPATH=${TNT_MODELS_PATH}/models/resnet:${PYTHONPATH} + +Furthermore, the ``ImageNet`` dataset needs to be installed and available on +all the nodes that we want to use for training. 
+
+TensorFlow provides convenience scripts to download datasets in its ``datasets``
+package, which is installed as a dependency of the TensorFlow Model Garden.
+Install ImageNet to your local machine as described
+`here `_.
+
+.. code-block:: bash
+
+    export TNT_DATASETS_PATH=/path/to/downloaded/datasets
+
+    python -m tensorflow_datasets.scripts.download_and_prepare \
+           --datasets=imagenet2012 --data_dir=${TNT_DATASETS_PATH}
+
+Let's assume we have access to two nodes (saved in ``hostfile``) equipped with 4 GPUs each.
+We can now simply run the ResNet-50 as follows:
+
+.. code-block:: bash
+
+    tarantella --hostfile ./hostfile --devices-per-node 4 \
+        -- ${TNT_MODELS_PATH}/models/resnet/resnet50_tnt.py --data_dir=${TNT_DATASETS_PATH} \
+        --batch_size=512 \
+        --train_epochs=90 \
+        --epochs_between_evals=10
+
+The above command will train a ResNet-50 model on the 8 available devices in parallel
+for ``90`` epochs, as suggested in [Goyal]_ to achieve convergence.
+The ``--epochs_between_evals`` parameter specifies how often the ``validation`` data
+is evaluated in between training epochs.
+
+Note the ``--batch_size`` parameter, which specifies the global batch size used in training.
+
+Implementation overview
+^^^^^^^^^^^^^^^^^^^^^^^
+We will now take a closer look at the implementation of the ResNet-50 training scheme.
+The main training steps reside in the ``models/resnet/resnet50_tnt.py`` file.
+
+The most important step in enabling data parallelism with Tarantella is
+to wrap the Keras model:
+
+.. code-block:: python
+
+    model = resnet_model.resnet50(num_classes=tf_imagenet_preprocessing.NUM_CLASSES)
+    model = tnt.Model(model)
+
+The remaining operations are identical to serial training, as they do not
+require any changes.
+In particular, the ImageNet dataset is loaded and preprocessed as follows:
+
+.. code-block:: python
+
+    train_dataset = imagenet_preprocessing.input_fn(is_training=True,
+                                                    data_dir=flags_obj.data_dir,
+                                                    batch_size=flags_obj.batch_size,
+                                                    shuffle_seed = 42,
+                                                    drop_remainder=True)
+
+The
+`imagenet_preprocessing.input_fn
+`_
+function takes the input files in ``data_dir``, loads the training samples and processes
+them into TensorFlow datasets.
+
+The user only needs to pass the global ``batch_size`` value, and the Tarantella
+framework will ensure that the dataset is properly distributed among devices,
+such that:
+
+  * each device will process an independent set of samples
+  * each device will group the samples into micro-batches, where the micro-batch
+    size will be computed as ``batch_size / num_devices``
+  * each device will apply the same set of transformations to its input samples as
+    specified in the ``input_fn`` function.
+
+Before starting the training, the model is compiled to use a standard Keras optimizer
+and loss.
+
+.. code-block:: python
+
+    model.compile(optimizer=optimizer,
+                  loss='sparse_categorical_crossentropy',
+                  metrics=(['sparse_categorical_accuracy']))
+
+We provide flags to enable the most commonly used Keras ``callbacks``, such as
+the ``TensorBoard`` profiler, which can simply be passed to the ``fit`` function
+of the Tarantella model.
+
+.. code-block:: python
+
+    callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir,
+                                                    profile_batch=2))
+
+If model checkpointing is required, it can be enabled through the ``ModelCheckpoint``
+callback as usual (cf. :ref:`checkpointing models with Tarantella `).
+
+.. code-block:: python
+
+    callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path, save_weights_only=True))
+
+There is no need for any further changes to proceed with training:
+
+.. code-block:: python
+
+    history = model.fit(train_dataset,
+                        epochs=flags_obj.train_epochs,
+                        callbacks=callbacks,
+                        validation_data=validation_dataset,
+                        validation_freq=flags_obj.epochs_between_evals,
+                        verbose=1)
+
+.. todo::
+
+    Advanced topics:
+
+    * scaling batch size with number of ranks (-> only mention here & link to advanced topics)
+    * introduce learning rate warm up
+    * introduce learning rate scaling (with #ranks)
+
+
+.. _transformer-label:
+
+Transformers
+------------
+
+The Transformer is a Deep Neural Network widely used in the field of natural language processing (NLP),
+in particular for tasks such as machine translation.
+It was first proposed by [Vaswani]_.
+
+Run the Transformer with Tarantella
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Transformer training scheme can be found
+`here `__,
+and has to be added to
+the existing ``PYTHONPATH``:
+
+.. code-block:: bash
+
+    export PYTHONPATH=${TNT_MODELS_PATH}/models/transformer:${PYTHONPATH}
+
+We will follow the training procedure presented in [Vaswani]_, where the authors
+show results for training the `big` variant of the Transformer model on
+a machine translation dataset called
+`WMT14 `_.
+
+To install the dataset, we will use the TensorFlow ``datasets`` package, which
+should already have been installed in your ``conda`` environment as a
+dependency for the TensorFlow Model Garden, and download the English-German
+dataset to match the results by [Vaswani]_.
+Detailed instructions on how to obtain the dataset are provided in the
+`TensorFlow documentation `_.
+
+Now we can start training.
+Once again, let's assume we have access to two nodes (specified in ``hostfile``)
+equipped with 4 GPUs each.
+
+.. code-block:: bash
+
+    export WMT14_PATH=/path/to/the/installed/dataset
+
+    tarantella --hostfile ./hostfile --devices-per-node 4 \
+        -- ${TNT_MODELS_PATH}/models/transformer/transformer_tnt.py \
+        --data_dir=${WMT14_PATH} \
+        --vocab_file=${WMT14_PATH}/vocab.ende.32768 \
+        --bleu_ref=${WMT14_PATH}/newstest2014.de \
+        --bleu_source=${WMT14_PATH}/newstest2014.en \
+        --param_set=big \
+        --train_epochs=30 \
+        --batch_size=32736
+
+The above command will select the ``big`` model implementation and train it
+distributedly on the 8 specified devices.
+To reach the target accuracy, [Vaswani]_ specifies that the model needs to be
+trained for ``30`` epochs.
+
+The Transformer requires access to a vocabulary file, which contains all the
+tokens derived from the dataset. This is provided as the ``vocab_file`` parameter
+and is part of the pre-processed dataset.
+
+After training, one round of evaluation is conducted using the ``newstest2014``
+dataset to translate English sentences into German.
+
+Implementation overview
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The Transformer model itself is implemented and imported from the
+`TensorFlow Model Garden
+`__.
+The training procedure and dataset loading and pre-processing do not require
+extensive changes to work with Tarantella. However, we provide a simplified
+version to highlight the usage of Tarantella with Keras training loops.
+
+Thus, the Keras transformer model is created in
+``models/transformer/transformer_tnt.py`` and wrapped into a Tarantella model:
+
+..
code-block:: python + + model = resnet_model.resnet50(num_classes=tf_imagenet_preprocessing.NUM_CLASSES) + model = tnt.Model(model) + +Data is loaded as follows, without any specific modification to trigger +distributed training: + +.. code-block:: python + + train_ds = data_pipeline.train_input_fn(self.params) + +Here, the ``data_pipeline.train_input_fn`` reads in the dataset and applies a series +of transformations to convert it into a batched set of sentences. +The advantage of using the *automatic dataset distribution* mechanism of Tarantella +is that users can reason about their I/O pipeline without taking care of the details +about how to distribute it. +Note however, that the batch size has to be a multiple of the number of ranks, so +that it can be efficiently divided into micro-batches. + +Next, the user can also create callbacks, which can then be simply passed on to +the training function. + +.. code-block:: python + + callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir)) + +Finally, we can call ``model.fit`` to start distributed training on all devices: + +.. code-block:: python + + history = model.fit(train_ds, + epochs=self.params["train_epochs"], + callbacks=callbacks, + verbose=1) + +.. todo:: + + Important points + + * Mixing Keras and Tarantella models + diff --git a/docs/source/why_tarantella.rst b/docs/source/why_tarantella.rst new file mode 100644 index 00000000..4313dd35 --- /dev/null +++ b/docs/source/why_tarantella.rst @@ -0,0 +1,44 @@ +Why Tarantella? +=============== + +Tarantella is an open-source Deep Learning framework that focuses on providing fast, scalable and +efficient training of Deep Neural Networks (DNNs) on High Performance Computing (HPC) clusters. + +Goals +----- + +Tarantella is designed to meet the following goals: + +.. code-block:: text + + Tarantella... + + 1. ...provides strong scalability + 2. ...is easy to use + 3. ...follows a synchronous training scheme + 4. ...integrates well with existing models + 5. ...provides support for GPU and CPU systems + +Tarantella provides close to linear speed-up for the training of common Deep Learning architectures, +thus considerably reducing the required time-to-accuracy in many Deep Learning workflows. +To make this capability accessible to as many users as possible, Tarantella's interface +is designed such that its use does not require any expertise in HPC or parallel computing. + +To allow integrating Tarantella into any TensorFlow-based Deep Learning workflow, +we put special emphasis on strictly following the synchronous optimization scheme +used to train DNNs. This guarantees that results obtained in serial execution can be +reproduced when using distributed training +(cf. however :ref:`these guidelines `), +so that computation can be scaled up at any point in time without losing reproducibility +of the results. + +Furthermore, we made sure that existing TensorFlow 2/Keras +models can be made ready for distributed training with minimal effort +(follow the :ref:`Quick Start guide ` to learn more). +Tarantella supports distributed training on GPU and pure CPU clusters, +independently of the hardware vendors. + +.. 
todo:: + + Performance Results + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 00000000..4870d072 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,17 @@ +set(TNT_PYTHON_DIRS + ${SRC_DIR}/tarantella + ${SRC_DIR}/runtime + ${SRC_DIR}/gpi_comm_lib/tf_ops/tnt_tfops) + +install(DIRECTORY ${TNT_PYTHON_DIRS} + DESTINATION ${INSTALL_LIB_DIR}/python + FILES_MATCHING PATTERN "*.py") + +install(PROGRAMS ${SRC_DIR}/bin/tarantella + DESTINATION ${INSTALL_BIN_DIR}) + +set(VERSION_FILE_TEMPLATE ${CMAKE_SOURCE_DIR}/cmake/version.py.in) +set(VERSION_FILE ${CMAKE_BUILD_DIR}/version.py) +configure_file(${VERSION_FILE_TEMPLATE} ${VERSION_FILE} @ONLY) +install(FILES ${VERSION_FILE} + DESTINATION ${INSTALL_LIB_DIR}/python) diff --git a/src/bin/tarantella b/src/bin/tarantella new file mode 100755 index 00000000..950e3531 --- /dev/null +++ b/src/bin/tarantella @@ -0,0 +1,198 @@ +#!/usr/bin/env python +import argparse +import logging +import os +import shutil +import subprocess +import sys + +TNT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +LIB_DIR = os.path.join(TNT_DIR, "lib/tarantella") +PYLIB_DIR = os.path.join(TNT_DIR, "lib/tarantella/python") +sys.path.insert(0, LIB_DIR) +sys.path.insert(0, PYLIB_DIR) + +try: + from version import tnt_version +except: + tnt_version = "Unknown version" + +try: + import runtime +except ModuleNotFoundError as e: + raise RuntimeError("[TNT_CLI] Cannot find Tarantella `runtime` module; \ +make sure the `tarantella` script is started from an installed version.") from e + +import runtime.file_management as file_man +import runtime.logging_config as logging_config +import runtime.platform_config as platform_config +import runtime.environment_config as env_config +from runtime import logger + +def parse_args(): + parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter) + singlenode_group = parser.add_argument_group('Single-node execution') + singlenode_group.add_argument("-n", + help="number of TensorFlow instances to start on the local node", + dest = "npernode", + metavar = "N", + type = int, + default = None) + multinode_group = parser.add_argument_group('Multi-node execution') + multinode_group.add_argument("--hostfile", + dest = "hostfile", + help="path to the list of nodes (hostnames) on which to execute the SCRIPT", + default = None) + multinode_group.add_argument("--n-per-node", "--devices-per-node", + help="number of devices (i.e., either GPUs or processes on CPUs) to be used on each node", + dest = "npernode", + type = int, + default = None) + + parser.add_argument("--no-gpu", "--no-gpus", + help="disallow GPU usage", + dest = "use_gpus", + action='store_false', + default = True) + parser.add_argument("--output-on-all-devices", + help="enable output on all devices (e.g., training info)", + dest = "output_all", + action='store_true', + default = False) + parser.add_argument("--log-on-all-devices", + help="enable library logging messages on all devices", + dest = "log_all", + action='store_true', + default = False) + log_levels = ('DEBUG', 'INFO', 'WARNING', 'ERROR') + parser.add_argument('--log-level', default='WARNING', choices=log_levels, + help = "logging level for library messages") + parser.add_argument("--fusion-threshold", + help="tensor fusion threshold [kilobytes]", + dest = "fusion_threshold_kb", + type = int, + default = None) + parser.add_argument("--dry-run", + help="print generated files and execution command", + dest = "dry_run", + action='store_true', + default = False) + 
parser.add_argument("--version", + action='version', + version=generate_version_message()) + parser.add_argument('script', nargs='+',metavar='-- SCRIPT') + args = parser.parse_args() + return args + +def tnt_run_message(command_list, hostfile_path, exec_script_path): + msg = "" + if not hostfile_path is None: + msg += "\n{}\nGenerated hostfile:\n".format("="*80) + with open(hostfile_path, 'r') as f: + msg += "============= {} =============\n{}\n".format(hostfile_path, + "".join(f.readlines())) + if not exec_script_path is None: + msg += "\n{}\nGenerated script:\n".format("="*80) + with open(exec_script_path, 'r') as f: + msg += "============= {} =============\n{}\n".format(exec_script_path, + "".join(f.readlines())) + msg += "\n{}".format("="*80) + msg += "\nCommand:\n\t{}\n".format(" ".join(command_list)) + return msg + +def generate_dry_run_message(command_list, hostfile_path, exec_script_path): + msg = "\n{}".format("="*80) + msg += "\n{0}{1}DRY RUN {1}{0}\n".format("="*6, " "*30) + msg += tnt_run_message(command_list, hostfile_path, exec_script_path) + return msg + +def generate_run_error_message(e, hostfile_path = None, + executed_script_path = None): + error_string = "" + if not e.stdout is None: + error_string += "============= STDOUT =============\n{}\n".format(e.stdout) + if not e.stderr is None: + error_string += "============= STDERR =============\n{}\n".format(e.stderr) + error_string += tnt_run_message(e.cmd, hostfile_path = hostfile_path, + exec_script_path = executed_script_path) + error_string += "[TNT_CLI] Execution failed with status {}".format(e.returncode) + return error_string + +def generate_version_message(): + msg = ["Tarantella {}".format(tnt_version), + "Path: {}".format(os.path.dirname(os.path.abspath(__file__))), + "Copyright (C) 2020 Fraunhofer"] + return "\n".join(msg) + +class Tarantella: + def __init__(self, hostlist, num_gpus_per_node, num_cpus_per_node, args): + self.args = args + + self.hostlist = hostlist + self.command_list = args.script + self.num_gpus_per_node = num_gpus_per_node + + # compute number of ranks per node to create the hostfile + npernode = num_gpus_per_node + device_type = "GPUs" + if npernode == 0: + npernode = num_cpus_per_node + device_type = "CPU processes" + + self.nranks = len(hostlist) * npernode + self.hostfile = file_man.HostFile(self.hostlist, npernode) + self.executable_script = self.generate_executable_script() + + logger.info("Starting Tarantella on {} devices ({} nodes x {} {})".format(self.nranks, + len(self.hostlist), npernode, device_type)) + + + def generate_executable_script(self): + # create execution script + header = "#!/bin/bash\n" + header += "cd {}".format(os.path.abspath(os.getcwd())) + + environment = env_config.gen_exports_from_dict(env_config.collect_environment_variables()) + \ + env_config.gen_exports_from_dict(env_config.collect_tensorflow_variables()) + \ + env_config.gen_exports_from_dict(env_config.collect_tarantella_variables()) + \ + env_config.gen_exports_from_dict(env_config.get_tnt_variables_from_args(self.args)) +\ + env_config.gen_exports_from_dict(env_config.get_tnt_gpus(self.num_gpus_per_node)) + + command = "python {}".format(' '.join(self.command_list)) + return file_man.GPIScriptFile(header, environment, command, dir = os.getcwd()) + + def run(self, dry_run = False): + with self.hostfile, self.executable_script: + command_list = ["gaspi_run", "-n", str(self.nranks), + "-m", self.hostfile.name, + self.executable_script.filename] + + if dry_run: + print(generate_dry_run_message(command_list, 
self.hostfile.name, + self.executable_script.filename)) + return + + path_to_gpi = shutil.which("gaspi_run") + if path_to_gpi is None: + sys.exit("[TNT_CLI] Cannot execute `gaspi_run`; make sure it is added to the current `PATH`.") + + try: + result = subprocess.run(command_list, + check = True, + cwd = os.getcwd(), + stdout = None, stderr = None,) + except subprocess.CalledProcessError as e: + sys.exit(generate_run_error_message(e, self.hostfile.name, + self.executable_script.filename)) + +if __name__ == "__main__": + args = parse_args() + logging_config.setup_logging(logger, args.log_level) + + nodes_list = platform_config.generate_nodes_list(args.hostfile) + num_gpus, num_cpus = platform_config.generate_num_devices_per_node(npernode = args.npernode, + use_gpus = args.use_gpus) + env_config.update_environment_paths(LIB_DIR) + + tarantella = Tarantella(nodes_list, num_gpus, num_cpus, args) + tarantella.run(args.dry_run) \ No newline at end of file diff --git a/src/gpi_comm_lib/AtomicCondition.hpp b/src/gpi_comm_lib/AtomicCondition.hpp new file mode 100644 index 00000000..99fc54d3 --- /dev/null +++ b/src/gpi_comm_lib/AtomicCondition.hpp @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +class AtomicCondition +{ + public: + void notify() + { + { + std::lock_guard lk(lock); + done = true; + } + condition.notify_one(); + } + + void wait() + { + std::unique_lock lk(lock); + condition.wait(lk, [&done = done]{return done;}); + done = false; + } + + private: + std::mutex lock; + std::condition_variable condition; + bool done; +}; \ No newline at end of file diff --git a/src/gpi_comm_lib/CMakeLists.txt b/src/gpi_comm_lib/CMakeLists.txt new file mode 100644 index 00000000..f31227f3 --- /dev/null +++ b/src/gpi_comm_lib/CMakeLists.txt @@ -0,0 +1,34 @@ +include (add_macros) + +set(GPI_LIB_MODULE "GPICommLib") + +set (GPICOMMLIB_SOURCES + ${SRC_DIR}/gpi_comm_lib/distribution/SegmentIDBuilder.cpp + ${SRC_DIR}/gpi_comm_lib/distribution/utilities.cpp + ${SRC_DIR}/gpi_comm_lib/PipelineCommunicator.cpp + ${SRC_DIR}/gpi_comm_lib/SynchCommunicator.cpp + ${SRC_DIR}/gpi_comm_lib/TensorBroadcaster.cpp +) + +extended_add_library(NAME gpicommlib + NAMESPACE tnt + TYPE SHARED + SOURCES + ${GPICOMMLIB_SOURCES} + LIBRARIES + tnt::gpiresources + tnt::collectives + INCLUDE_DIRECTORIES + ${SRC_DIR}/gpi_comm_lib/ + INSTALL + INSTALL_DESTINATION + ${INSTALL_LIB_DIR} + POSITION_INDEPENDENT) + +pybind11_add_module(${GPI_LIB_MODULE} MODULE + ${SRC_DIR}/gpi_comm_lib/pybind11_wrappers.cpp) +target_link_libraries(${GPI_LIB_MODULE} PRIVATE pybind11::module + tnt::gpicommlib) +install(TARGETS ${GPI_LIB_MODULE} + LIBRARY + DESTINATION ${INSTALL_LIB_DIR}) diff --git a/src/gpi_comm_lib/PipelineCommunicator.cpp b/src/gpi_comm_lib/PipelineCommunicator.cpp new file mode 100644 index 00000000..cf4c328d --- /dev/null +++ b/src/gpi_comm_lib/PipelineCommunicator.cpp @@ -0,0 +1,116 @@ +#include "PipelineCommunicator.hpp" + +#include "collectives/barrier/GPIBarrier.hpp" +#include "distribution/GroupBuilder.hpp" +#include "gpi/gaspiCheckReturn.hpp" + +#include + +#include + +namespace tarantella +{ + PipelineCommunicator::PipelineCommunicator( + GPI::Context& context, + std::unordered_map const& connection_infos, + std::size_t num_micro_batches) + : resource_manager(context.get_resource_manager()) + { + for(auto const& [conn_id, conn_info] : connection_infos) + { + auto const segment_id = conn_info.segment_id; + auto const buffer_size = conn_info.microbatched_buffer_size_bytes; + auto const segment_size = 2 * num_micro_batches * 
buffer_size; + + auto const segment_group = resource_manager.make_group({context.get_rank(), conn_info.other_rank}); + resource_manager.make_segment_resources(segment_id, segment_group, segment_size); + + std::vector send_bufs; + std::vector recv_bufs; + std::vector notifications; + for(std::size_t m_id = 0; m_id < num_micro_batches; ++m_id) + { + send_bufs.push_back(resource_manager.get_buffer_of_size(segment_id, buffer_size)); + recv_bufs.push_back(resource_manager.get_buffer_of_size(segment_id, buffer_size)); + notifications.push_back(resource_manager.get_notification_range(segment_id, 1).first); + } + connections.emplace(conn_id, SendRecvResources(conn_info.other_rank, + send_bufs, + recv_bufs, + notifications)); + } + + // Barrier is required, to ensure all ranks have finished registering + // their segments to their communication partners + collectives::Barrier::GPIBarrierAllRanks barrier; + barrier.blocking_barrier(); + } + + void PipelineCommunicator::non_blocking_send(void* local_send_buf, + ConnectionID conn_id, + MicrobatchID micro_id) + { + auto const& local_segment_buf = connections[conn_id].send_bufs[micro_id]; + auto const& remote_segment_buf = connections[conn_id].recv_bufs[micro_id]; + + copy_data_to_segment(local_send_buf, local_segment_buf); + + GPI::gaspiCheckReturn( + gaspi_write_notify(local_segment_buf.get_segment_id(), + local_segment_buf.get_offset(), + connections[conn_id].other_rank, + remote_segment_buf.get_segment_id(), + remote_segment_buf.get_offset(), + local_segment_buf.get_size(), + connections[conn_id].notifications[micro_id], + micro_id + 1, // to check micro_id at recv (must not be zero) + resource_manager.get_queue_id_for_write_notify(), + GASPI_BLOCK), + "PipelineCommunicator::non_blocking_send"); + } + + void PipelineCommunicator::blocking_recv(void* local_recv_buf, + ConnectionID conn_id, + MicrobatchID micro_id) + { + auto const& local_segment_buf = connections[conn_id].recv_bufs[micro_id]; + gaspi_notification_id_t received_notification_id = 0; + gaspi_notification_t received_notification_value = 0; + + GPI::gaspiCheckReturn( + gaspi_notify_waitsome(local_segment_buf.get_segment_id(), + connections[conn_id].notifications[micro_id], + 1, + &received_notification_id, + GASPI_BLOCK), + "PipelineCommunicator::blocking_recv : gaspi_notify_waitsome"); + GPI::gaspiCheckReturn( + gaspi_notify_reset(local_segment_buf.get_segment_id(), + received_notification_id, + &received_notification_value), + "PipelineCommunicator::blocking_recv : gaspi_notify_reset"); + if (received_notification_value != micro_id + 1) + { + throw std::runtime_error("PipelineCommunicator::blocking_recv : \ + Incorrect notification value received"); + } + + copy_data_from_segment(local_recv_buf, local_segment_buf); + } + + void PipelineCommunicator::copy_data_to_segment(void* local_send_buf, + GPI::SegmentBuffer const& segment_buffer) + { + auto const segment_ptr = segment_buffer.get_ptr(); + auto const buffer_size = segment_buffer.get_size(); + std::memcpy(segment_ptr, local_send_buf, buffer_size); + } + + void PipelineCommunicator::copy_data_from_segment(void* local_recv_buf, + GPI::SegmentBuffer const& segment_buffer) + { + auto const segment_ptr = segment_buffer.get_ptr(); + auto const buffer_size = segment_buffer.get_size(); + std::memcpy(local_recv_buf, segment_ptr, buffer_size); + } +} diff --git a/src/gpi_comm_lib/PipelineCommunicator.hpp b/src/gpi_comm_lib/PipelineCommunicator.hpp new file mode 100644 index 00000000..3c6effaa --- /dev/null +++ 
b/src/gpi_comm_lib/PipelineCommunicator.hpp @@ -0,0 +1,65 @@ +#pragma once + +#include +#include +#include + +#include +#include + +namespace tarantella +{ + class SendRecvResources + { + public: + SendRecvResources() = default; + SendRecvResources(GPI::Rank rank, + std::vector const& send_bufs, + std::vector const& recv_bufs, + std::vector const& notifications) + : other_rank(rank), send_bufs(send_bufs), recv_bufs(recv_bufs), notifications(notifications) + {} + + GPI::Rank other_rank; + std::vector send_bufs; + std::vector recv_bufs; + std::vector notifications; + }; + + class ConnectionInfo + { + public: + explicit ConnectionInfo(GPI::SegmentID segment_id, GPI::Rank other_rank, std::size_t buffer_size_bytes) + : segment_id(segment_id), other_rank(other_rank), microbatched_buffer_size_bytes(buffer_size_bytes) + {} + + GPI::SegmentID segment_id; + GPI::Rank other_rank; + std::size_t microbatched_buffer_size_bytes; + }; + + class PipelineCommunicator + { + public: + using ConnectionID = std::size_t; + using MicrobatchID = std::size_t; + + PipelineCommunicator(GPI::Context&, + std::unordered_map const&, + std::size_t num_micro_batches); + + void non_blocking_send(void* local_send_buf, + ConnectionID, + MicrobatchID); + void blocking_recv(void* local_recv_buf, + ConnectionID, + MicrobatchID); + + private: + GPI::ResourceManager& resource_manager; + std::unordered_map connections; + + void copy_data_to_segment(void* local_send_buf, GPI::SegmentBuffer const&); + void copy_data_from_segment(void* local_recv_buf, GPI::SegmentBuffer const&); + }; +} diff --git a/src/gpi_comm_lib/SynchCommunicator.cpp b/src/gpi_comm_lib/SynchCommunicator.cpp new file mode 100644 index 00000000..163635b5 --- /dev/null +++ b/src/gpi_comm_lib/SynchCommunicator.cpp @@ -0,0 +1,166 @@ +#include "SynchCommunicator.hpp" +#include "collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp" + +#include +#include +#include + +namespace tarantella +{ + void SynchCommunicator::create_fused_tensor_infos_and_ids( + std::vector const& tensor_infos, + std::size_t threshold_bytes) + { + collectives::TensorFusor fusor {threshold_bytes}; + fusor.fuse_tensor_infos_and_ids(tensor_infos, fused_ids, fused_tensor_infos); + } + + void SynchCommunicator::create_fused_tensors_synchronization() + { + for(auto const& fused_info : fused_tensor_infos) + { + auto const fused_id = fused_info.first; + ready_to_start_counters[fused_id] = std::make_unique>(0UL); + finished_counters[fused_id] = std::make_unique>(0UL); + ready_to_copy_back[fused_id] = std::make_unique>(false); + ready_to_reset_counters[fused_id] = std::make_unique>(0UL); + } + } + + SynchCommunicator::SynchCommunicator(GPI::Context& context, + GPI::SegmentID segment_id, + GPI::Group const& group, + std::vector const& tensor_infos, + std::size_t threshold_for_tensor_fusion_bytes) + : resource_manager(context.get_resource_manager()), + segment_id(segment_id), + group(group), + queue_handler(), + fused_ids(), + fused_tensor_infos(), + operators(), + ready_to_start_counters(), + finished_counters(), + ready_to_copy_back(), + ready_to_reset_counters(), + setup_has_finished(), + terminate_man_thread(false), + management_thread(&tarantella::SynchCommunicator::management_thread_task, this) + { + using AllreduceImplementation = collectives::Allreduce::RecursiveHalvingDoubleBuffer; + create_fused_tensor_infos_and_ids(tensor_infos, threshold_for_tensor_fusion_bytes); + create_fused_tensors_synchronization(); + create_segment_resources(tensor_infos); + create_operators_with_state(); + 
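+      // Note: the management thread was already started in the member-initialiser
+      // list above and is blocked in setup_has_finished.wait(); the notify() below
+      // releases it only once all segment resources, synchronisation counters and
+      // allreduce operators have been created.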
setup_has_finished.notify(); + } + + SynchCommunicator::SynchCommunicator(GPI::Context& context, + GPI::SegmentID segment_id, + GPI::Group const& group, + std::vector const& tensor_infos) + : SynchCommunicator(context, segment_id, group, tensor_infos, 0UL) + { } + + SynchCommunicator::~SynchCommunicator() + { + terminate_man_thread = true; + if (management_thread.joinable()) + { + management_thread.join(); + } + } + + void SynchCommunicator::start_allreduce_impl(GradID const& grad_id, const void* data_ptr) + { + auto const fused_id = fused_ids[grad_id]; + + // All `grad_id`s copy-in their respective data + copy_data_to_segment(grad_id, data_ptr); + auto const value = ready_to_start_counters[fused_id]->fetch_add(1UL); + + // Make sure all copies are done, before last `grad_id` starts operator + if (value == fused_tensor_infos[fused_id].get_num_tensors()-1) + { + operators[fused_id].allreduce->start(); + ready_to_start_counters[fused_id]->store(0UL); + } + } + + void SynchCommunicator::finish_allreduce_impl(GradID const& grad_id, void* results_ptr) + { + auto const fused_id = fused_ids[grad_id]; + + // First `grad_id` to arrive waits for `has_finished`, and notifies + // everyone that results can be copied back + auto const num_arrived = finished_counters[fused_id]->fetch_add(1UL); + if (num_arrived == 0) + { + operators[fused_id].has_finished->wait(); + ready_to_copy_back[fused_id]->store(true); + } + + // All `grad_id`s copy-out their respective data, + // once results have been obtained + while(true) + { + if(ready_to_copy_back[fused_id]->load()) + { + copy_data_from_segment(grad_id, results_ptr); + break; + } + } + + // Make sure all copies are done, before last `grad_id` resets initial state + auto const copied_grads = ready_to_reset_counters[fused_id]->fetch_add(1UL); + if (copied_grads == fused_tensor_infos[fused_id].get_num_tensors()-1) + { + operators[fused_id].allreduce->reset_for_reuse(); + finished_counters[fused_id]->store(0UL); + ready_to_copy_back[fused_id]->store(false); + ready_to_reset_counters[fused_id]->store(0UL); + } + } + + void SynchCommunicator::copy_data_to_segment(GradID const& grad_id, const void* data_ptr) + { + auto const fused_id = fused_ids[grad_id]; + auto const segment_ptr = reinterpret_cast(operators[fused_id].allreduce->get_input_ptr()) + + fused_tensor_infos[fused_id].get_local_offset_bytes(grad_id); + std::memcpy(segment_ptr, data_ptr, fused_tensor_infos[fused_id].get_local_size_bytes(grad_id)); + } + + void SynchCommunicator::copy_data_from_segment(GradID const& grad_id, void* results_ptr) + { + auto const fused_id = fused_ids[grad_id]; + auto const segment_ptr = reinterpret_cast( operators[fused_id].allreduce->get_result_ptr()) + + fused_tensor_infos[fused_id].get_local_offset_bytes(grad_id); + std::memcpy(results_ptr, segment_ptr, fused_tensor_infos[fused_id].get_local_size_bytes(grad_id)); + } + + void SynchCommunicator::management_thread_task() + { + setup_has_finished.wait(); + while (!terminate_man_thread) + { + while (true) + { + if (terminate_man_thread) + { + break; + } + for (auto& element : operators) + { + auto& op = *(element.second.allreduce.get()); + if (op.is_finished()) continue; + + op.trigger_communication_step(); + if (op.is_finished()) + { + element.second.has_finished->notify(); + } + } + } + } + } +} diff --git a/src/gpi_comm_lib/SynchCommunicator.hpp b/src/gpi_comm_lib/SynchCommunicator.hpp new file mode 100644 index 00000000..db0a2fc7 --- /dev/null +++ b/src/gpi_comm_lib/SynchCommunicator.hpp @@ -0,0 +1,134 @@ +#pragma once + 
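+// SynchCommunicator implements synchronous gradient allreduce over GASPI segments.
+// Incoming gradients are fused into larger buffers by a collectives::TensorFusor
+// (up to a configurable byte threshold), one non-blocking allreduce Operator is
+// created per fused tensor, and a dedicated management thread repeatedly calls
+// trigger_communication_step() on each operator until it reports completion.
+//
+// Rough usage sketch (identifiers other than the public interface declared below
+// are illustrative assumptions):
+//
+//   SynchCommunicator comm(context, segment_id, group, tensor_infos);
+//   // per gradient tensor and training iteration:
+//   comm.start_allreduce_impl(grad_id, grad_data);      // copy in; last tensor starts the operator
+//   comm.finish_allreduce_impl(grad_id, reduced_data);  // wait for the result and copy it out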
+#include "AtomicCondition.hpp" +#include "collectives/allreduce/Operator.hpp" +#include "collectives/barrier/GPIBarrier.hpp" +#include "collectives/FusedTensorInfo.hpp" +#include "collectives/TensorInfo.hpp" +#include "collectives/Types.hpp" +#include "distribution/utilities.hpp" +#include "gpi/Context.hpp" +#include "gpi/ResourceManager.hpp" +#include "queues.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace tarantella +{ + using GradID = collectives::GradID; + using FusedID = collectives::FusedID; + + class SynchCommunicator + { + public: + SynchCommunicator(GPI::Context&, GPI::SegmentID, GPI::Group const&, std::vector const&); + SynchCommunicator(GPI::Context&, GPI::SegmentID, GPI::Group const&, std::vector const&, std::size_t); + SynchCommunicator(SynchCommunicator&) = delete; + SynchCommunicator& operator=(SynchCommunicator&) = delete; + ~SynchCommunicator(); + + // TODO: Replace void* with a LocalBuffer struct {ptr, size} + void start_allreduce_impl(GradID const&, const void*); + void finish_allreduce_impl(GradID const&, void*); + + private: + struct OperatorWithState + { + std::unique_ptr allreduce; + std::unique_ptr has_finished; + }; + + static collectives::Allreduce::Operator::ReductionOp const reduction_op = collectives::Allreduce::Operator::ReductionOp::AVERAGE; + + GPI::ResourceManager& resource_manager; + GPI::SegmentID segment_id; + GPI::Group const& group; + collectives::queues queue_handler; // TODO replace with the ResourceManager + + std::unordered_map fused_ids; + std::unordered_map fused_tensor_infos; + std::unordered_map operators; + + std::unordered_map>> ready_to_start_counters; + std::unordered_map>> finished_counters; + std::unordered_map>> ready_to_copy_back; + std::unordered_map>> ready_to_reset_counters; + + AtomicCondition setup_has_finished; + std::atomic terminate_man_thread; + std::thread management_thread; + void management_thread_task(); + + void copy_data_to_segment(GradID const&, const void*); + void copy_data_from_segment(GradID const&, void*); + + void create_fused_tensor_infos_and_ids(std::vector const&, std::size_t); + void create_fused_tensors_synchronization(); + + template + constexpr float get_overhead_factor() const; + + template + void create_segment_resources(std::vector const& tensor_infos) const; + + void create_fused_tensor_infos(std::vector const &tensor_infos); + + template + std::unique_ptr create_allreduce_op(collectives::TensorInfo const&); + + template + void create_operators_with_state(); + }; + + template + constexpr float SynchCommunicator::get_overhead_factor() const + { + return 3.5; + } + + template + void SynchCommunicator::create_segment_resources(std::vector const& tensor_infos) const + { + auto const segment_size = distribution::get_segment_size(tensor_infos, get_overhead_factor()); + resource_manager.make_segment_resources(segment_id, group, segment_size); + + // Barrier is required, to ensure all ranks have finished registering + // their segments to their communication partners + collectives::Barrier::GPIBarrier barrier(group); + barrier.blocking_barrier(); + } + + template + std::unique_ptr SynchCommunicator::create_allreduce_op(collectives::TensorInfo const& tensor_info) + { + auto const required_resources = AllreduceAlgorithm::get_required_resources(tensor_info, group); + + collectives::Allreduce::Operator::ResourceList resources; + for (auto const& resource : required_resources) + { + resources.emplace_back( + resource_manager.get_buffer_of_size(segment_id, 
resource.buffer_size), + resource_manager.get_notification_range(segment_id, resource.num_notifications)); + } + + return std::make_unique(tensor_info, reduction_op, resources, queue_handler, group); + } + + template + void SynchCommunicator::create_operators_with_state() + { + for(auto const& fused_info : fused_tensor_infos) + { + auto const tensor_id = fused_info.first; + auto const tensor_info = fused_info.second.to_tensor_info(); + OperatorWithState op{create_allreduce_op(tensor_info), std::make_unique()}; + operators.emplace(tensor_id, std::move(op)); + } + } +} diff --git a/src/gpi_comm_lib/TensorBroadcaster.cpp b/src/gpi_comm_lib/TensorBroadcaster.cpp new file mode 100644 index 00000000..f28cc19b --- /dev/null +++ b/src/gpi_comm_lib/TensorBroadcaster.cpp @@ -0,0 +1,84 @@ +#include "TensorBroadcaster.hpp" + +#include "distribution/utilities.hpp" +#include "gpi/Context.hpp" +#include "gpi/ResourceManager.hpp" +#include "gpi/SegmentBuffer.hpp" + +#include +#include + +namespace tarantella +{ + TensorBroadcaster::TensorBroadcaster(GPI::Context& context, + GPI::SegmentID segment_id, + GPI::Group const& group, + std::vector const& tensor_infos, + GPI::Rank root_rank) + : context(context), + group(group), + queue_handler(), + root(root_rank), + barrier(group) + { + if(!group.contains_rank(root_rank)) + { + throw std::runtime_error("[TensorBroadcaster::constructor]:\ + Incorrect root_rank is not part of the broadcast group"); + } + + auto const overhead_factor = 1.0; + auto& resource_manager = context.get_resource_manager(); + auto const segment_size = distribution::get_segment_size(tensor_infos, overhead_factor); + + resource_manager.make_segment_resources(segment_id, group, segment_size); + + // Barrier is required, to ensure all ranks have finished registering + // their segments to their communication partners + barrier.blocking_barrier(); + + for(auto const& info : tensor_infos) + { + auto const size_in_bytes = info.get_nelems() * getDataTypeSize(info.get_elem_type()); + buffers.emplace_back(resource_manager.get_buffer_of_size(segment_id, size_in_bytes)); + } + + auto const notifications = resource_manager.get_notification_range(segment_id, + collectives::broadcast::getNumberOfNotifications(group.get_size())); + bcast_op = std::make_unique(root, segment_size, segment_id, buffers.front().get_offset(), + notifications.first, queue_handler); + } + + void TensorBroadcaster::exec_broadcast(std::vector const& data_ptrs) + { + // copy data to segments + if (context.get_rank() == root) + { + for (std::size_t i = 0; i < data_ptrs.size(); ++i) + { + std::memcpy(buffers[i].get_ptr(), data_ptrs[i], buffers[i].get_size()); + } + } + + // start the operation + if (context.get_rank() == root) + { + bcast_op->signal(); + } + // execute broadcast + while(bcast_op->operator()() != 0); + + // copy results back to buffers + if (context.get_rank() != root) + { + for (std::size_t i = 0; i < data_ptrs.size(); ++i) + { + std::memcpy(data_ptrs[i], buffers[i].get_ptr(), buffers[i].get_size()); + } + } + + // finalize operation + barrier.blocking_barrier(); + } +} + diff --git a/src/gpi_comm_lib/TensorBroadcaster.hpp b/src/gpi_comm_lib/TensorBroadcaster.hpp new file mode 100644 index 00000000..c72440ce --- /dev/null +++ b/src/gpi_comm_lib/TensorBroadcaster.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include "collectives/barrier/GPIBarrier.hpp" +#include "collectives/TensorInfo.hpp" +#include "gpi/Context.hpp" +#include "gpi/Group.hpp" +#include "gpi/SegmentBuffer.hpp" +#include "broadcast.h" + +#include +#include + 
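+// TensorBroadcaster distributes a set of tensors from a root rank to all other
+// ranks of a GPI group: the root stages the tensors in a GASPI segment, the
+// collectives broadcast primitive propagates them, and non-root ranks copy the
+// received data back into the user-provided buffers.
+//
+// Rough usage sketch (argument names are illustrative):
+//
+//   TensorBroadcaster bcaster(context, segment_id, group, tensor_infos, root_rank);
+//   bcaster.exec_broadcast(tensor_data_ptrs);  // collective call; returns after a final barrier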
+namespace tarantella +{ + + class TensorBroadcaster + { + public: + TensorBroadcaster(GPI::Context&, GPI::SegmentID, GPI::Group const&, + std::vector const&, GPI::Rank root_rank); + void exec_broadcast(std::vector const&); + + private: + GPI::Context& context; + GPI::Group const group; + collectives::queues queue_handler; // FIXME: use GPI::ResourcesManager + GPI::Rank root; + collectives::Barrier::GPIBarrier barrier; + + std::vector buffers; + std::unique_ptr bcast_op; + }; +} diff --git a/src/gpi_comm_lib/collectives/BufferElementType.cpp b/src/gpi_comm_lib/collectives/BufferElementType.cpp new file mode 100644 index 00000000..ac0b457e --- /dev/null +++ b/src/gpi_comm_lib/collectives/BufferElementType.cpp @@ -0,0 +1,26 @@ +#include "BufferElementType.hpp" + +#include + +namespace tarantella +{ + namespace collectives + { + std::size_t getDataTypeSize(const BufferElementType d) + { + std::unordered_map const sizes + { + {BufferElementType::FLOAT, sizeof(float)}, + {BufferElementType::DOUBLE, sizeof(double)}, + {BufferElementType::INT16, sizeof(int16_t)}, + {BufferElementType::INT32, sizeof(int32_t)} + }; + return sizes.at(d); + } + + std::ostream &operator<<(std::ostream& os, BufferElementType const& elem_type) + { + return os << static_cast(elem_type); + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/BufferElementType.hpp b/src/gpi_comm_lib/collectives/BufferElementType.hpp new file mode 100644 index 00000000..846dda2c --- /dev/null +++ b/src/gpi_comm_lib/collectives/BufferElementType.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include +#include + +namespace tarantella +{ + namespace collectives + { + enum class BufferElementType + { + FLOAT, + DOUBLE, + INT16, + INT32 + }; + + std::size_t getDataTypeSize(const BufferElementType d); + std::ostream &operator<<(std::ostream& os, BufferElementType const& elem_type); + } +} diff --git a/src/gpi_comm_lib/collectives/CMakeLists.txt b/src/gpi_comm_lib/collectives/CMakeLists.txt new file mode 100644 index 00000000..9ed5643d --- /dev/null +++ b/src/gpi_comm_lib/collectives/CMakeLists.txt @@ -0,0 +1,41 @@ + +set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) + +set(COLLECTIVES_SRC_DIR ${SRC_DIR}/gpi_comm_lib/collectives) +set(libSources + ${COLLECTIVES_SRC_DIR}/lib/allreduceButterfly.cpp + ${COLLECTIVES_SRC_DIR}/lib/allreduceButterflyDoubleBuffer.cpp + ${COLLECTIVES_SRC_DIR}/lib/broadcast.cpp + ${COLLECTIVES_SRC_DIR}/lib/counter.cpp + ${COLLECTIVES_SRC_DIR}/lib/mailBoxGaspi.cpp + ${COLLECTIVES_SRC_DIR}/lib/mailBoxLocal.cpp + ${COLLECTIVES_SRC_DIR}/lib/queues.cpp + ${COLLECTIVES_SRC_DIR}/lib/reduce.cpp + ${COLLECTIVES_SRC_DIR}/lib/writer.cpp + ${COLLECTIVES_SRC_DIR}/allreduce/RecursiveHalving.cpp + ${COLLECTIVES_SRC_DIR}/allreduce/RecursiveHalvingDoubleBuffer.cpp + ${COLLECTIVES_SRC_DIR}/allreduce/utils.cpp + ${COLLECTIVES_SRC_DIR}/barrier/GPIBarrier.cpp + ${COLLECTIVES_SRC_DIR}/BufferElementType.cpp + ${COLLECTIVES_SRC_DIR}/FusedTensorInfo.cpp + ${COLLECTIVES_SRC_DIR}/TensorInfo.cpp +) + +extended_add_library(NAME collectives + NAMESPACE tnt + TYPE SHARED + SOURCES + ${libSources} + LIBRARIES + optimized GPI2::GPI2 + debug GPI2::GPI2dbg + tnt::gpiresources + INCLUDE_DIRECTORIES + ${COLLECTIVES_SRC_DIR}/lib/ + COMPILE_OPTIONS + -Wno-unused-private-field + INSTALL + INSTALL_DESTINATION + ${INSTALL_LIB_DIR} + POSITION_INDEPENDENT) + diff --git a/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp b/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp new file mode 100644 index 00000000..f0ceed6b --- /dev/null +++ 
b/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp @@ -0,0 +1,185 @@ +#include "FusedTensorInfo.hpp" + +namespace tarantella +{ + namespace collectives + { + void FusedTensorInfo::initialise_from_tensor_info(TensorInfo const& tensor_info) + { + local_offset_bytes.clear(); + local_size_bytes.clear(); + + id = tensor_info.get_id(); + nelems = tensor_info.get_nelems(); + elem_type = tensor_info.get_elem_type(); + elem_size = getDataTypeSize(elem_type); + size_bytes = nelems * elem_size; + num_tensors = 1UL; + tensor_ids.push_back(id); + local_offset_bytes[id] = 0UL; + local_size_bytes[id] = size_bytes; + } + + FusedTensorInfo::FusedTensorInfo() + : id(), + nelems(), + elem_type(), + elem_size(), + size_bytes(), + num_tensors(), + tensor_ids(), + local_offset_bytes(), + local_size_bytes() + { } + + FusedTensorInfo::FusedTensorInfo(TensorInfo const& tensor_info) + : FusedTensorInfo() + { + initialise_from_tensor_info(tensor_info); + } + + FusedTensorInfo& FusedTensorInfo::operator=(TensorInfo const& tensor_info) + { + initialise_from_tensor_info(tensor_info); + return *this; + } + + bool FusedTensorInfo::operator==(FusedTensorInfo const& other) const + { + return ( this->id == other.id && + this->nelems == other.nelems && + this->elem_type == other.elem_type && + this->num_tensors == other.num_tensors && + this->local_offset_bytes == other.local_offset_bytes && + this->local_size_bytes == other.local_size_bytes ); + + } + + FusedID FusedTensorInfo::get_id() const + { + return id; + } + + std::size_t FusedTensorInfo::get_nelems() const + { + return nelems; + } + + BufferElementType FusedTensorInfo::get_elem_type() const + { + return elem_type; + } + + std::size_t FusedTensorInfo::get_size_bytes() const + { + return size_bytes; + } + + std::size_t FusedTensorInfo::get_num_tensors() const + { + return num_tensors; + } + + std::vector FusedTensorInfo::get_tensor_ids() const + { + return tensor_ids; + } + + std::size_t FusedTensorInfo::get_local_offset_bytes(GradID const& grad_id) const + { + auto const it = local_offset_bytes.find(grad_id); + if (it == local_offset_bytes.end()) + { + throw std::logic_error("FusedTensorInfo::get_local_offset_bytes: FusedTensorInfo does not contain GradID"); + } + return it->second; + } + + std::size_t FusedTensorInfo::get_local_size_bytes(GradID const& grad_id) const + { + auto const it = local_size_bytes.find(grad_id); + if (it == local_size_bytes.end()) + { + throw std::logic_error("FusedTensorInfo::get_local_size_bytes: FusedTensorInfo does not contain GradID"); + } + return it->second; + } + + void FusedTensorInfo::add_tensor_info(TensorInfo const& tensor_info) + { + if (tensor_info.get_elem_type() != get_elem_type()) + { + throw std::logic_error("FusedTensorInfo::add_tensor_info: Tensors need to have same data type"); + } + + auto const grad_id = tensor_info.get_id(); + auto const grad_nelems = tensor_info.get_nelems(); + auto const grad_size_bytes = grad_nelems * elem_size; + auto const current_offset = size_bytes; + + nelems += grad_nelems; + size_bytes += grad_size_bytes; + num_tensors += 1UL; + + tensor_ids.push_back(grad_id); + local_offset_bytes[grad_id] = current_offset; + local_size_bytes[grad_id] = grad_size_bytes; + } + + TensorInfo FusedTensorInfo::to_tensor_info() const + { + return {get_id(), get_nelems(), get_elem_type()}; + } + + TensorFusor::TensorFusor() + : threshold_bytes(0UL) + { } + + TensorFusor::TensorFusor(std::size_t threshold) + : threshold_bytes(threshold) + { } + + void TensorFusor::fuse_tensor_infos_and_ids(std::vector const& 
tensor_infos, + IDMap& fused_ids, + InfoMap& fused_tensor_infos) + { + if (tensor_infos.size() == 1) + { + auto const tensor_info = tensor_infos.front(); + auto const id = tensor_info.get_id(); + fused_ids[id] = id; + fused_tensor_infos[id] = tensor_info; + } + + collectives::FusedTensorInfo fused_info(tensor_infos.front()); + auto tensor_id = tensor_infos.front().get_id(); + FusedID fused_id(tensor_id); + fused_ids[tensor_id] = fused_id; + + for (auto idx = 1UL; idx < tensor_infos.size(); ++idx) + { + tensor_id = tensor_infos[idx].get_id(); + + if (fused_info.get_size_bytes() < threshold_bytes) + { + fused_info.add_tensor_info(tensor_infos[idx]); + } + else + { + fused_tensor_infos[fused_id] = fused_info; + fused_id = tensor_id; + fused_info = tensor_infos[idx]; + } + + fused_ids[tensor_id] = fused_id; + + // Always add the last fused_tensor to the vector. + // Note, that it might still be smaller than `threshold_bytes`. + if (idx == tensor_infos.size() - 1) + { + fused_tensor_infos[fused_id] = fused_info; + } + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp b/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp new file mode 100644 index 00000000..92925d2a --- /dev/null +++ b/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp @@ -0,0 +1,69 @@ +#pragma once + +#include "BufferElementType.hpp" +#include "TensorInfo.hpp" +#include "Types.hpp" + +#include +#include +#include + +namespace tarantella +{ + namespace collectives + { + class FusedTensorInfo + { + public: + FusedTensorInfo(); + FusedTensorInfo(TensorInfo const&); + FusedTensorInfo& operator=(TensorInfo const&); + bool operator==(FusedTensorInfo const&) const; + + FusedID get_id() const; + std::size_t get_nelems() const; + BufferElementType get_elem_type() const; + std::size_t get_size_bytes() const; + + std::size_t get_num_tensors() const; + std::vector get_tensor_ids() const; + + std::size_t get_local_offset_bytes(GradID const&) const; + std::size_t get_local_size_bytes(GradID const&) const; + + void add_tensor_info(TensorInfo const&); + TensorInfo to_tensor_info() const; + + private: + FusedID id; + std::size_t nelems; + BufferElementType elem_type; + std::size_t elem_size; + std::size_t size_bytes; + std::size_t num_tensors; + + std::vector tensor_ids; + std::unordered_map local_offset_bytes; + std::unordered_map local_size_bytes; + + void initialise_from_tensor_info(TensorInfo const&); + }; + + class TensorFusor + { + public: + using IDMap = std::unordered_map; + using InfoMap = std::unordered_map; + + TensorFusor(); + TensorFusor(std::size_t threshold); + + void fuse_tensor_infos_and_ids(std::vector const&, + IDMap&, + InfoMap&); + + private: + std::size_t threshold_bytes; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/TensorInfo.cpp b/src/gpi_comm_lib/collectives/TensorInfo.cpp new file mode 100644 index 00000000..b8b9d345 --- /dev/null +++ b/src/gpi_comm_lib/collectives/TensorInfo.cpp @@ -0,0 +1,26 @@ +#include "TensorInfo.hpp" + +namespace tarantella +{ + namespace collectives + { + TensorInfo::TensorInfo(GradID tensid, std::size_t nelems, BufferElementType elem_type) + : id(tensid), nelems(nelems), elem_type(elem_type) + {} + + GradID TensorInfo::get_id() const + { + return id; + } + + std::size_t TensorInfo::get_nelems() const + { + return nelems; + } + + BufferElementType TensorInfo::get_elem_type() const + { + return elem_type; + } + } +} diff --git a/src/gpi_comm_lib/collectives/TensorInfo.hpp b/src/gpi_comm_lib/collectives/TensorInfo.hpp new file mode 100644 index 00000000..374ce0dc --- 
/dev/null +++ b/src/gpi_comm_lib/collectives/TensorInfo.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "BufferElementType.hpp" +#include "Types.hpp" + +#include + +namespace tarantella +{ + namespace collectives + { + class TensorInfo + { + public: + TensorInfo(GradID tensid, std::size_t nelems, BufferElementType elem_type); + + GradID get_id() const; + std::size_t get_nelems() const; + BufferElementType get_elem_type() const; + + private: + const GradID id; + const std::size_t nelems; + const BufferElementType elem_type; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/Types.hpp b/src/gpi_comm_lib/collectives/Types.hpp new file mode 100644 index 00000000..2efe0caa --- /dev/null +++ b/src/gpi_comm_lib/collectives/Types.hpp @@ -0,0 +1,10 @@ +#pragma once + +namespace tarantella +{ + namespace collectives + { + using GradID = std::size_t; + using FusedID = std::size_t; + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/Operator.hpp b/src/gpi_comm_lib/collectives/allreduce/Operator.hpp new file mode 100644 index 00000000..e268721c --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/Operator.hpp @@ -0,0 +1,71 @@ +#pragma once + +#include "collectives/BufferElementType.hpp" +#include "gpi/NotificationManager.hpp" +#include "gpi/SegmentBuffer.hpp" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + // \note + // Interface for non-blocking, asynchronous Allreduce algorithms (not thread-safe) + class Operator + { + public: + class RequiredResource + { + public: + std::size_t buffer_size; + std::size_t num_notifications; + }; + using RequiredResourceList = std::vector; + using Resource = std::pair; + using ResourceList = std::vector; + + enum class ReductionOp + { + SUM, + AVERAGE + }; + + enum class OperatorState + { + NOT_STARTED, + RUNNING, + FINISHED + }; + + virtual ~Operator() = default; + + // Initiates the Allreduce operation (non-blocking) + // and sets is_running == TRUE + virtual void start() = 0; + + // Makes partial progress towards computing the Allreduce result + // and has to be called multiple times until the operation is completed, + // when is_finished == TRUE + // can be called independently of the state; + // it only tries to make progress if is_running == TRUE + virtual void trigger_communication_step() = 0; + + // Enables the Allreduce to be started again + // and sets is_running == FALSE and is_finished == FALSE + virtual void reset_for_reuse() = 0; + virtual bool is_running() const = 0; + + // If TRUE, results are available until reset_for_reuse() is called + virtual bool is_finished() const = 0; + + // TODO: void* -> SegmentBuffer + virtual void* get_input_ptr() const = 0; + virtual void* get_result_ptr() const = 0; + }; + } + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp new file mode 100644 index 00000000..be62d494 --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp @@ -0,0 +1,98 @@ +#include "RecursiveHalving.hpp" + +#include "utils.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + RecursiveHalving::RecursiveHalving(TensorInfo tensor_info, + ReductionOp reduction_op, + ResourceList const &resource_list, + queues &queues, + GPI::Group const &group) + : group(group), + state(OperatorState::NOT_STARTED), + allreduce(tensor_info.get_nelems(), to_allreduce_dataType(tensor_info.get_elem_type()), + to_allreduce_reductionType(reduction_op), + 
to_allreduce_segment_buffer(resource_list.at(0)), + to_allreduce_segment_buffer(resource_list.at(1)), + queues, group), + barrier(group) + {} + + void RecursiveHalving::start() + { + if (is_running()) + { + throw std::logic_error("[RecursiveHalving::start] Operation already started."); + } + if (is_finished()) + { + throw std::logic_error("[RecursiveHalving::start] Operation not reset after finish."); + } + allreduce.signal(); + state = OperatorState::RUNNING; + } + + void RecursiveHalving::trigger_communication_step() + { + if (is_running()) + { + auto const result = allreduce(); + if (result == 0) + { + barrier.blocking_barrier(); + state = OperatorState::FINISHED; + } + } + else + { + // do nothing before start() is called + } + } + + void RecursiveHalving::reset_for_reuse() + { + if (is_running()) + { + throw std::logic_error("[RecursiveHalving::reset] Cannot reset while running."); + } + state = OperatorState::NOT_STARTED; + } + + bool RecursiveHalving::is_running() const + { + return state == OperatorState::RUNNING; + } + + bool RecursiveHalving::is_finished() const + { + return state == OperatorState::FINISHED; + } + + Operator::RequiredResourceList RecursiveHalving::get_required_resources( + TensorInfo const& tensor_info, GPI::Group const& group) + { + auto const num_notifications = allreduceButterfly::getNumberOfNotifications(group.get_size()); + auto const num_elements_data_segment = tensor_info.get_nelems(); + auto const num_elements_temp_segment = static_cast( + allreduceButterfly::getNumberOfElementsSegmentCommunicate(tensor_info.get_nelems(), group.get_size())); + return {{num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}, + {num_elements_temp_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}}; + } + + void* RecursiveHalving::get_input_ptr() const + { + return allreduce.getReducePointer(); + } + + void* RecursiveHalving::get_result_ptr() const + { + return allreduce.getReducePointer(); + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp new file mode 100644 index 00000000..1687cf20 --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp @@ -0,0 +1,49 @@ +#pragma once + +#include "Operator.hpp" +#include "allreduceButterfly.h" +#include "collectives/barrier/GPIBarrier.hpp" +#include "collectives/TensorInfo.hpp" +#include "gpi/Group.hpp" +#include "gpi/NotificationManager.hpp" +#include "gpi/SegmentBuffer.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + class RecursiveHalving : public Operator + { + public: + RecursiveHalving(TensorInfo, + ReductionOp, + ResourceList const&, + queues&, + GPI::Group const&); + RecursiveHalving(const RecursiveHalving&) = delete; + RecursiveHalving& operator=(const RecursiveHalving&) = delete; + ~RecursiveHalving() = default; + + void start() override; + void trigger_communication_step() override; + + void reset_for_reuse() override; + bool is_running() const override; + bool is_finished() const override; + + void* get_input_ptr() const override; + void* get_result_ptr() const override; + + static RequiredResourceList get_required_resources(TensorInfo const&, GPI::Group const&); + + private: + GPI::Group const& group; + std::atomic state; + allreduceButterfly allreduce; + Barrier::GPIBarrier barrier; + }; + } + } +} \ No newline at end of file diff --git 
a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp new file mode 100644 index 00000000..0a2f842a --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp @@ -0,0 +1,98 @@ +#include "RecursiveHalvingDoubleBuffer.hpp" + +#include "gpi/gaspiCheckReturn.hpp" +#include "utils.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + RecursiveHalvingDoubleBuffer::RecursiveHalvingDoubleBuffer(TensorInfo tensor_info, + ReductionOp reduction_op, + ResourceList const& resource_list, + queues& queues, + GPI::Group const& group) + : state(OperatorState::NOT_STARTED), + allreduce(tensor_info.get_nelems(), + to_allreduce_dataType(tensor_info.get_elem_type()), + to_allreduce_reductionType(reduction_op), + to_allreduce_segment_buffer(resource_list.at(0)), + to_allreduce_segment_buffer(resource_list.at(1)), + to_allreduce_segment_buffer(resource_list.at(2)), + queues, group) + { } + + void RecursiveHalvingDoubleBuffer::start() + { + if (is_running()) + { + throw std::logic_error("[RecursiveHalvingDoubleBuffer::start] Operation already started."); + } + if (is_finished()) + { + throw std::logic_error("[RecursiveHalvingDoubleBuffer::start] Operation not reset after finish."); + } + allreduce.signal(); + state = OperatorState::RUNNING; + } + + void RecursiveHalvingDoubleBuffer::trigger_communication_step() + { + if (is_running()) + { + auto const result = allreduce(); + if (result == 0) + { + state = OperatorState::FINISHED; + } + } + } + + void RecursiveHalvingDoubleBuffer::reset_for_reuse() + { + if (is_running()) + { + throw std::logic_error("[RecursiveHalvingDoubleBuffer::reset] Cannot reset while running."); + } + state = OperatorState::NOT_STARTED; + } + + bool RecursiveHalvingDoubleBuffer::is_running() const + { + return state == OperatorState::RUNNING; + } + + bool RecursiveHalvingDoubleBuffer::is_finished() const + { + return state == OperatorState::FINISHED; + } + + Operator::RequiredResourceList RecursiveHalvingDoubleBuffer::get_required_resources( + TensorInfo const& tensor_info, GPI::Group const& group) + { + auto const num_notifications = allreduceButterflyDoubleBuffer::getNumberOfNotifications(group.get_size()); + + auto const num_elements_data_segment = tensor_info.get_nelems(); + auto const num_elements_temp_segment = static_cast( + allreduceButterflyDoubleBuffer::getNumberOfElementsSegmentCommunicate( + tensor_info.get_nelems(), group.get_size())); + + return {{num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}, + {num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}, + {num_elements_temp_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}}; + } + + void* RecursiveHalvingDoubleBuffer::get_input_ptr() const + { + return allreduce.getActiveReducePointer(); + } + + void* RecursiveHalvingDoubleBuffer::get_result_ptr() const + { + return allreduce.getResultsPointer(); + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp new file mode 100644 index 00000000..653a891f --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp @@ -0,0 +1,44 @@ +#pragma once + +#include "Operator.hpp" +#include "allreduceButterflyDoubleBuffer.h" +#include "collectives/TensorInfo.hpp" +#include 
"gpi/Group.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + class RecursiveHalvingDoubleBuffer : public Operator + { + public: + RecursiveHalvingDoubleBuffer(TensorInfo, + ReductionOp, + ResourceList const&, + queues&, + GPI::Group const&); + RecursiveHalvingDoubleBuffer(const RecursiveHalvingDoubleBuffer&) = delete; + RecursiveHalvingDoubleBuffer& operator=(const RecursiveHalvingDoubleBuffer&) = delete; + ~RecursiveHalvingDoubleBuffer() = default; + + void start() override; + void trigger_communication_step() override; + + void reset_for_reuse() override; + bool is_running() const override; + bool is_finished() const override; + + virtual void* get_input_ptr() const override; + virtual void* get_result_ptr() const override; + + static RequiredResourceList get_required_resources(TensorInfo const&, GPI::Group const& group); + + private: + std::atomic state; + allreduceButterflyDoubleBuffer allreduce; + }; + } + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/utils.cpp b/src/gpi_comm_lib/collectives/allreduce/utils.cpp new file mode 100644 index 00000000..da5f5b3b --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/utils.cpp @@ -0,0 +1,39 @@ +#include "utils.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + allreduce::dataType to_allreduce_dataType(const BufferElementType type) + { + std::unordered_map const types{ + {BufferElementType::FLOAT, allreduce::FLOAT}, + {BufferElementType::DOUBLE, allreduce::DOUBLE}, + {BufferElementType::INT16, allreduce::INT16}, + {BufferElementType::INT32, allreduce::INT32}, + }; + return types.at(type); + } + + allreduce::reductionType to_allreduce_reductionType(const Operator::ReductionOp op) + { + std::unordered_map const reduction_ops{ + {Operator::ReductionOp::SUM, allreduce::SUM}, + {Operator::ReductionOp::AVERAGE, allreduce::AVERAGE}, + }; + return reduction_ops.at(op); + } + + allreduceButterfly::segmentBuffer to_allreduce_segment_buffer(Operator::Resource const& resource) + { + auto const [data_segment_buffer, notif_range] = resource; + allreduceButterfly::segmentBuffer buffer{data_segment_buffer.get_segment_id(), + data_segment_buffer.get_offset(), + static_cast(notif_range.first)}; + return buffer; + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/allreduce/utils.hpp b/src/gpi_comm_lib/collectives/allreduce/utils.hpp new file mode 100644 index 00000000..462faa51 --- /dev/null +++ b/src/gpi_comm_lib/collectives/allreduce/utils.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "allreduce.h" +#include "allreduceButterfly.h" +#include "collectives/BufferElementType.hpp" +#include "Operator.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Allreduce + { + allreduce::dataType to_allreduce_dataType(const BufferElementType type); + allreduce::reductionType to_allreduce_reductionType( + const Operator::ReductionOp op); + allreduceButterfly::segmentBuffer to_allreduce_segment_buffer( + Operator::Resource const &resource); + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp new file mode 100644 index 00000000..c954a6a9 --- /dev/null +++ b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp @@ -0,0 +1,38 @@ +#include "GPIBarrier.hpp" +#include "gpi/gaspiCheckReturn.hpp" + +#include + +namespace tarantella +{ + namespace collectives + { + namespace Barrier + { + GPIBarrier::GPIBarrier(GPI::Group const &group) + { + gaspi_rank_t comm_size; + 
GPI::gaspiCheckReturn(gaspi_proc_num(&comm_size), + "GPIBarrier::GPIBarrier : get number of ranks"); + if (group.get_size() != comm_size) + { + throw std::invalid_argument("GPIBarrier::GPIBarrier : can only be used with all ranks in \ + the default GPI communicator"); + } + } + + // TODO: implement for any GPI::Group + void GPIBarrier::blocking_barrier() + { + GPI::gaspiCheckReturn(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK), + "GPIBarrier::GPIBarrier : barrier failed"); + } + + void GPIBarrierAllRanks::blocking_barrier() + { + GPI::gaspiCheckReturn(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK), + "GPIBarrierAllRanks::GPIBarrierAllRanks : barrier failed"); + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp new file mode 100644 index 00000000..3ac367a1 --- /dev/null +++ b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp @@ -0,0 +1,30 @@ +#pragma once + +#include "gpi/Group.hpp" +#include "Operator.hpp" + +namespace tarantella +{ + namespace collectives + { + namespace Barrier + { + // GPI Barrier implementation for GROUP_COMM_ALL + class GPIBarrier : public Operator + { + public: + + GPIBarrier(GPI::Group const & group); + void blocking_barrier(); + }; + + class GPIBarrierAllRanks : public Operator + { + public: + + GPIBarrierAllRanks() = default; + void blocking_barrier(); + }; + } + } +} diff --git a/src/gpi_comm_lib/collectives/barrier/Operator.hpp b/src/gpi_comm_lib/collectives/barrier/Operator.hpp new file mode 100644 index 00000000..81e863bb --- /dev/null +++ b/src/gpi_comm_lib/collectives/barrier/Operator.hpp @@ -0,0 +1,20 @@ +#pragma once + +namespace tarantella +{ + namespace collectives + { + namespace Barrier + { + // \note + // Interface for Barrier algorithms (not thread-safe) + class Operator + { + public: + virtual ~Operator() = default; + + virtual void blocking_barrier() = 0; + }; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/allreduce.h b/src/gpi_comm_lib/collectives/lib/allreduce.h new file mode 100755 index 00000000..b9f12fe5 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/allreduce.h @@ -0,0 +1,27 @@ +#pragma once + +namespace tarantella +{ + namespace collectives + { + class allreduce { + public: + enum reductionType { + SUM = 0, + AVERAGE = 1, + NUM_RED = 2 + }; + enum dataType { + FLOAT = 0, + DOUBLE = 1, + INT16 = 2, + INT32 = 3, + NUM_TYPE = 4 + }; + + virtual int operator()() = 0; + virtual void signal() = 0; + virtual ~allreduce() {} + }; + } +} diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp new file mode 100755 index 00000000..26fb2b58 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp @@ -0,0 +1,418 @@ +#include "allreduceButterfly.h" +#include "gpi/gaspiCheckReturn.hpp" +#include "mailBoxGaspi.h" +#include "gpi/Group.hpp" + +#include +#include +#include + +namespace tarantella +{ + namespace collectives + { + using tarantella::GPI::gaspiCheckReturn; + + nestedRingParameter::nestedRingParameter(const rankIndexType numRanks_, + const rankIndexType rank_) : + numRanks(numRanks_), + rank(rank_), + ringSizes(getRingSizes(numRanks)), + strides(getStrides(ringSizes)), + ringIndices(getRingIndices(ringSizes, rank_)) {} + + inline nestedRingParameter::ringSizesType nestedRingParameter::getRingSizes( + rankIndexType numRanks) { + ringSizesType s; + + unsigned long limit = std::sqrt(numRanks) + 2; + + for (unsigned long factor=2; factor < limit; factor++) { + while 
((numRanks % factor) == 0) { + s.push_back(factor); + numRanks /= factor; + } + } + + if (numRanks > 1) { + s.push_back(numRanks); + } + + return s; + } + + inline nestedRingParameter::stridesType nestedRingParameter::getStrides( + const ringSizesType& ringSizes) { + const long numLevels = ringSizes.size(); + stridesType s(ringSizes.size()); + unsigned long factor = 1; + for (long level=numLevels - 1; level >= 0; level--) { + s[level] = factor; + factor *= ringSizes[level]; + } + + return s; + } + + inline nestedRingParameter::ringIndicesType + nestedRingParameter::getRingIndices(const ringSizesType& ringSizes, + const rankIndexType rank) { + ringIndicesType indices; + + rankIndexType product = 1; + for (unsigned long i=0; i < ringSizes.size(); i++) { + indices.push_back((rank / product) % ringSizes[i]); + product *= ringSizes[i]; + } + + return indices; + } + + nestedRingParameter::rankIndexType + nestedRingParameter::getNumberOfRings() const{ + return ringSizes.size(); + } + + nestedRingParameter::rankIndexType nestedRingParameter::getRingLength( + const levelType level) const { + return ringSizes[level]; + } + + nestedRingParameter::rankIndexType nestedRingParameter::getLocalRankInRing( + const levelType level) const { + return ringIndices[level]; + } + + nestedRingParameter::rankIndexType + nestedRingParameter::getGlobalRankToWriteInRing( + const levelType level) const { + long numLevels = ringSizes.size(); + rankIndexType r = 0; + for (long i=numLevels - 1; i > long(level); i--) { + r = ringIndices[i] + ringSizes[i] * r; + } + const rankIndexType next = (ringIndices[level] + 1) % ringSizes[level]; + r = next + ringSizes[level] * r; + for (long i=long(level) - 1; i >= 0; i--) { + r = ringIndices[i] + ringSizes[i] * r; + } + return r; + } + + nestedRingParameter::bufferIndexType nestedRingParameter::getBufferLength( + const levelType level) const { + return strides[level]; + } + + nestedRingParameter::bufferIndexType nestedRingParameter::getBufferStart( + const levelType level, + const bufferIndexType buffer) const { + // we assume that each global rank aggregates on each level the buffer + // that matches the local ring id. This buffer is + // I.E. 
getBufferStart(level, getRankInRing(level)) + // -> getBufferStart(level, getRankInRing(level)) + getBufferLength(level) + + bufferIndexType s = 0; + for (unsigned long i=0; i < level; i++) { + s += ringIndices[i] * strides[i]; + } + s += buffer * strides[level]; + + return s; + } + + allreduceButterfly::allreduceButterfly( + const long len, + const dataType data, + const reductionType reduction, + const segmentBuffer locationReduce_, + const segmentBuffer locationCommunicate_, + queues& queues_, + GPI::Group const& group_ + ) + : totalLength(len), + dataElement(data), + group(group_), + numRanks(getNumRanks()), + rank(getRank()), + locationReduce(locationReduce_), + locationReducePointer(getSegmentPointer(locationReduce_.segment) + + locationReduce_.offset), + locationCommunicate(locationCommunicate_), + topology(numRanks, getRankIndex(rank, getRanks())), + sender(queues_), + reducer(getReduce(data, reduction)), + status(2 * getNumberOfNotifications(numRanks) + 1){ + + std::vector ranks = getRanks(); + + setReduceScatter(ranks); + setAllToAll(ranks); + } + + long allreduceButterfly::getNumRanks() const { + return group.get_size(); + } + + long allreduceButterfly::getRank() { + gaspi_rank_t rank; + gaspiCheckReturn(gaspi_proc_rank(&rank), + "gaspi_proc_rank failed with "); + return rank; + } + + std::vector allreduceButterfly::getRanks() const { + return group.get_ranks(); + } + + unsigned long allreduceButterfly::getRankIndex( + gaspi_rank_t rank, + const std::vector& ranks) { + unsigned long rankIndex; + if (find(ranks.begin(), ranks.end(), rank) == ranks.end()) { + throw std::runtime_error("rank not member of group"); + } else { + rankIndex = find(ranks.begin(), ranks.end(), rank) + - ranks.begin(); + } + return rankIndex; + } + + void allreduceButterfly::setReduceScatter( + const std::vector& ranks) { + gaspi_notification_id_t nextNotification + = locationCommunicate.firstNotification; + gaspi_offset_t nextLocalCommunicationBufferByte = 0; + const char* const reductionSourceBasePointer = + getSegmentPointer(locationCommunicate.segment) + + locationCommunicate.offset; + char* const reductionDestinationBasePointer = + getSegmentPointer(locationReduce.segment) + + locationReduce.offset; + + receiver.push_back(&trigger); + jobs.push_back(jobType()); + + for (unsigned long ring=0; ring < topology.getNumberOfRings(); ring++) { + + const rankIndexType ringLength = topology.getRingLength(ring); + const rankIndexType ringRank = topology.getLocalRankInRing(ring); + const bufferIndexType bufferLengthIndex = topology.getBufferLength(ring); + const gaspi_rank_t outgoingGlobalRank = + ranks[topology.getGlobalRankToWriteInRing(ring)]; + gaspi_offset_t nextRemoteCommunicationBufferByte + = nextLocalCommunicationBufferByte; + + + for (unsigned long loop=0; loop < ringLength - 1; loop++) { + const unsigned long currentJob = receiver.size() - 1; + receiver.push_back( + new mailBoxGaspi(locationCommunicate.segment, nextNotification)); + jobs.push_back(jobType()); + + const bufferIndexType sendBufferID = + (ringRank + ringLength - loop - 1) % ringLength; + const bufferIndexType sendStartIndex = + topology.getBufferStart(ring, sendBufferID); + const gaspi_offset_t sendStartByte = + chunkIndexToByte(sendStartIndex); + const long sendLengthByte = + chunkIndexToByte(sendStartIndex + bufferLengthIndex) + - sendStartByte; + const writer::transferParameters transfer( + true, + outgoingGlobalRank, + locationReduce.segment, + locationReduce.offset + sendStartByte, + locationCommunicate.segment, + 
locationCommunicate.offset + nextRemoteCommunicationBufferByte, + sendLengthByte, + nextNotification); + jobs[currentJob].second = transfer; + + const bufferIndexType receiveBufferID = + (ringRank + ringLength - loop - 2) % ringLength; + const bufferIndexType receiveStartIndex = + topology.getBufferStart(ring, receiveBufferID); + const gaspi_offset_t receiveStartByte = + chunkIndexToByte(receiveStartIndex); + const long receiveLengthByte = + chunkIndexToByte(receiveStartIndex + bufferLengthIndex) + - receiveStartByte; + const reduce::task copy( + reductionSourceBasePointer + nextLocalCommunicationBufferByte, + reductionDestinationBasePointer + receiveStartByte, + receiveLengthByte / getDataTypeSize(dataElement)); + jobs[currentJob + 1].first = copy; + + nextNotification++; + nextRemoteCommunicationBufferByte += sendLengthByte; + nextLocalCommunicationBufferByte += receiveLengthByte; + } + } + + jobs.back().first.scaling = numRanks; + } + + inline char* allreduceButterfly::getSegmentPointer( + const gaspi_segment_id_t segment) { + gaspi_pointer_t p; + gaspiCheckReturn(gaspi_segment_ptr(segment, &p), + "failed getting segment pointer"); + return (char*) p; + } + + inline unsigned long allreduceButterfly::chunkIndexToByte( + const long chunkIndex) const { + return ((totalLength * chunkIndex + numRanks - 1) / numRanks) + * getDataTypeSize(dataElement); + } + + void allreduceButterfly::setAllToAll( + const std::vector& ranks) { + gaspi_notification_id_t nextNotification = locationReduce.firstNotification; + + for (long ring=topology.getNumberOfRings() - 1; ring >=0 ; ring--) { + + const rankIndexType ringLength = topology.getRingLength(ring); + const rankIndexType ringRank = topology.getLocalRankInRing(ring); + const bufferIndexType bufferLengthIndex = topology.getBufferLength(ring); + const gaspi_rank_t outgoingGlobalRank = + ranks[topology.getGlobalRankToWriteInRing(ring)]; + + for (unsigned long loop=0; loop < ringLength - 1; loop++) { + const unsigned long currentJob = receiver.size() - 1; + receiver.push_back( + new mailBoxGaspi(locationReduce.segment, nextNotification)); + jobs.push_back(jobType()); + + const bufferIndexType transferBufferID = + (ringRank + ringLength - loop) % ringLength; + const bufferIndexType transferStartIndex = + topology.getBufferStart(ring, transferBufferID); + const gaspi_offset_t transferStartByte = + chunkIndexToByte(transferStartIndex); + const long transferLengthByte = + chunkIndexToByte(transferStartIndex + bufferLengthIndex) + - transferStartByte; + + const writer::transferParameters transfer( + true, + outgoingGlobalRank, + locationReduce.segment, + locationReduce.offset + transferStartByte, + locationReduce.segment, + locationReduce.offset + transferStartByte, + transferLengthByte, + nextNotification); + jobs[currentJob].second = transfer; + + nextNotification++; + } + } + } + + allreduceButterfly::~allreduceButterfly() { + delete reducer; + for (unsigned long i=1; i < receiver.size(); i++) { + delete receiver[i]; + } + } + + int allreduceButterfly::operator()() { + const unsigned long phase = status.get(); + // could be a problem if we overtake one iteration? + if (!receiver[phase]->gotNotification()) { + return -1; + } + + reducer->operator()(jobs[phase].first); + // hier schon freigeben? + sender(jobs[phase].second); + + return (status.increment() == 0) ? 
0 : -1; + } + + void allreduceButterfly::signal() { + trigger.notify(); + } + + gaspi_pointer_t allreduceButterfly::getReducePointer() const { + return locationReducePointer; + } + + long allreduceButterfly::getNumberOfElementsSegmentCommunicate( + const long len, + const long numRanks) { + return ((len + numRanks - 1) / numRanks) * (numRanks - 1); + } + + unsigned long allreduceButterfly::getNumberOfNotifications( + const long numRanks) { + const nestedRingParameter topology(numRanks); + + gaspi_notification_id_t notifications = 0; + for (unsigned long i=0; i < topology.getNumberOfRings(); i++) { + notifications += topology.getRingLength(i) - 1; + } + + return notifications; + } + + std::ostream& allreduceButterfly::report(std::ostream& s) const { + char* pr = getSegmentPointer(locationReduce.segment); + char* pc = getSegmentPointer(locationCommunicate.segment); + const unsigned long phase = status.get(); + s << "total length: " << totalLength << std::endl + << "dataElement: " << dataElement << std::endl + << "numRanks: " << numRanks << std::endl + << "rank: " << rank << std::endl + << "topology.getNumberOfRings" << topology.getNumberOfRings() << std::endl + << "getNumberOfNotifications(): " + << getNumberOfNotifications(numRanks) << std::endl + << "segmentReduce: " << long(locationReduce.segment) << std::endl + << "offsetReduce: " << locationReduce.offset << std::endl + << "firstNotificationReduce: " << locationReduce.firstNotification + << std::endl + << "segmentCommunicate: " << long(locationCommunicate.segment) + << std::endl + << "offsetCommunicate: " << locationCommunicate.offset << std::endl + << "firstNotificationCommunicate: " + << locationCommunicate.firstNotification << std::endl + << "pointer segment reduce : " + << (void*)getSegmentPointer(locationReduce.segment) << std::endl + << "pointer segment communicate: " + << (void*)getSegmentPointer(locationCommunicate.segment) << std::endl + << "phase " << phase << std::endl; + for (unsigned long i=0; i < jobs.size(); i++) { + s << ".........................." << std::endl; + s << "phase " << i << std::endl; + if (i==0) { + s << "Receiver: " << "user" << std::endl; + } else { + mailBoxGaspi* m = (mailBoxGaspi*) receiver[i]; + s << "Receiver: segment " << long(m->getSegmentID()) + << " notification ID " << m->getMailID() << std::endl; + } + + if (jobs[i].first.len > 0) { + s << "Reduce : src " << jobs[i].first.source + << " (" << (char*)jobs[i].first.source - pc << ")" + << " dst " << jobs[i].first.destination + << " (" << (char*)jobs[i].first.destination - pr << ")" + << " ele " << jobs[i].first.len + << " (" << jobs[i].first.len * getDataTypeSize(dataElement) << ")" + << std::endl; + } else { + s << "Reduce : idle" << std::endl; + } + + s << "Send : "; + jobs[i].second.report(s) << std::endl; + } + s << ".........................." 
<< std::endl; + + return s; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h new file mode 100644 index 00000000..194bac0a --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h @@ -0,0 +1,118 @@ +#pragma once + +#include "allreduce.h" +#include "counter.h" +#include "gpi/Group.hpp" +#include "mailBox.h" +#include "mailBoxLocal.h" +#include "queues.h" +#include "reduce.h" +#include "writer.h" + +#include + +#include + +namespace tarantella +{ + namespace collectives + { + class nestedRingParameter { + public: + typedef unsigned long rankIndexType; + typedef unsigned long levelType; + typedef unsigned long bufferIndexType; + + nestedRingParameter(const rankIndexType numRanks_, + const rankIndexType rank_=0); + + rankIndexType getNumberOfRings() const; + rankIndexType getRingLength(const levelType level) const; + rankIndexType getLocalRankInRing(const levelType level) const; + rankIndexType getGlobalRankToWriteInRing(const levelType level) const; + bufferIndexType getBufferLength(const levelType level) const; + bufferIndexType getBufferStart(const levelType level, + const bufferIndexType buffer) const; + + private: + + typedef std::vector ringIndicesType; + typedef std::vector ringSizesType; + typedef std::vector stridesType; + + static inline ringSizesType getRingSizes(rankIndexType numRanks); + static inline stridesType getStrides(const ringSizesType& ringSizes); + static inline ringIndicesType getRingIndices(const ringSizesType& ringSizes, + const rankIndexType rank); + + const rankIndexType numRanks; + const rankIndexType rank; + const ringSizesType ringSizes; + const stridesType strides; + const ringIndicesType ringIndices; + }; + + class allreduceButterfly : public allreduce { + public: + + struct segmentBuffer { + gaspi_segment_id_t segment; + gaspi_offset_t offset; + gaspi_notification_id_t firstNotification; + }; + + allreduceButterfly(const long len, + const dataType data, + const reductionType reduction, + const segmentBuffer segmentReduce, + const segmentBuffer segmentCommunicate, + queues& queues_, + GPI::Group const& group_); + ~allreduceButterfly(); + int operator()(); + void signal(); + + gaspi_pointer_t getReducePointer() const; + static long getNumberOfElementsSegmentCommunicate(const long len, + const long numRanks); + static unsigned long getNumberOfNotifications(const long numRanks); + std::ostream& report(std::ostream& s) const; + + private: + + typedef nestedRingParameter::rankIndexType rankIndexType; + typedef nestedRingParameter::bufferIndexType bufferIndexType; + typedef std::pair jobType; + + inline long getNumRanks() const; + static inline long getRank(); + std::vector getRanks() const; + static inline rankIndexType getRankIndex( + gaspi_rank_t rank, + const std::vector& ranks); + void setReduceScatter(const std::vector& ranks); + inline static char* getSegmentPointer(const gaspi_segment_id_t segment); + inline unsigned long chunkIndexToByte(const long chunkIndex) const; + void setAllToAll(const std::vector& ranks); + + const long totalLength; + const dataType dataElement; + GPI::Group const group; + const long numRanks; + const gaspi_rank_t rank; + const segmentBuffer locationReduce; + const gaspi_pointer_t locationReducePointer; + const segmentBuffer locationCommunicate; + + const nestedRingParameter topology; + + mailBoxLocal trigger; + std::vector receiver; + std::vector jobs; + + writer sender; + reduce * reducer; + counter status; + }; + } +} diff --git 
a/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp new file mode 100755 index 00000000..484b5268 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp @@ -0,0 +1,91 @@ +#include "allreduceButterflyDoubleBuffer.h" + +namespace tarantella +{ + namespace collectives + { + allreduceButterflyDoubleBuffer::allreduceButterflyDoubleBuffer( + const long len, + const dataType data, + const reductionType reduction, + const allreduceButterfly::segmentBuffer segmentReduce0, + const allreduceButterfly::segmentBuffer segmentReduce1, + const allreduceButterfly::segmentBuffer segmentCommunicate, + queues& queues, + GPI::Group const& group) + : state(0), + reduceFirst(len, data, reduction, segmentReduce0, + segmentCommunicate, queues, group), + reduceSecond(len, data, reduction, segmentReduce1, + segmentCommunicate, queues, group) { + tableReduce[0] = &reduceFirst; + tableReduce[1] = &reduceSecond; + } + + int allreduceButterflyDoubleBuffer::operator()() { + const int result = getReduce()(); + + if (!result) { + flipReduce(); + } + + return result; + } + + inline allreduceButterfly& allreduceButterflyDoubleBuffer::getReduce() const { + return tableReduce[stateToIndex(state)][0]; + } + + inline long allreduceButterflyDoubleBuffer::stateToIndex(const long state) { + return state & 1l; + } + + inline void allreduceButterflyDoubleBuffer::flipReduce() { + __sync_fetch_and_add(&state, 1l); + } + + void allreduceButterflyDoubleBuffer::signal() { + getReduce().signal(); + } + + gaspi_pointer_t allreduceButterflyDoubleBuffer::getActiveReducePointer() const { + return getReduce().getReducePointer(); + } + + gaspi_pointer_t allreduceButterflyDoubleBuffer::getResultsPointer() const { + return getOtherReduce().getReducePointer(); + } + + inline const allreduceButterfly& + allreduceButterflyDoubleBuffer::getOtherReduce() const { + return tableReduce[invertIndex(stateToIndex(state))][0]; + } + + inline long allreduceButterflyDoubleBuffer::invertIndex(const long state) { + return state ^ 1l; + } + + long allreduceButterflyDoubleBuffer::getNumberOfElementsSegmentCommunicate( + const long len, + const long numRanks) { + return allreduceButterfly::getNumberOfElementsSegmentCommunicate(len, + numRanks); + } + + unsigned long allreduceButterflyDoubleBuffer::getNumberOfNotifications( + const long numRanks) { + return allreduceButterfly::getNumberOfNotifications(numRanks); + } + + std::ostream& allreduceButterflyDoubleBuffer::report(std::ostream& s) const { + s << "stateExecute: " << state << std::endl + << "***** reduceFirst *****" << std::endl; + reduceFirst.report(s); + s << "***** reduceSecond *****" << std::endl; + reduceSecond.report(s); + + return s; + } + } +} + \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h new file mode 100755 index 00000000..e4f503b7 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h @@ -0,0 +1,52 @@ +#pragma once + +#include "allreduceButterfly.h" +#include "gpi/Group.hpp" + +namespace tarantella +{ + namespace collectives + { + class allreduceButterflyDoubleBuffer : public allreduce { + public: + + allreduceButterflyDoubleBuffer( + const long len, + const dataType data, + const reductionType reduction, + const allreduceButterfly::segmentBuffer segmentReduce0, + const allreduceButterfly::segmentBuffer segmentReduce1, + 
const allreduceButterfly::segmentBuffer segmentCommunicate, + queues& queues, + GPI::Group const& group); + int operator()(); + void signal(); + + gaspi_pointer_t getActiveReducePointer() const; + gaspi_pointer_t getResultsPointer() const; + static long getNumberOfElementsSegmentCommunicate(const long len, + const long numRanks); + static unsigned long getNumberOfNotifications(const long numRanks); + std::ostream& report(std::ostream& s) const; + + private: + + inline allreduceButterfly& getReduce() const; + static inline long stateToIndex(const long state); + inline void flipReduce(); + inline const allreduceButterfly& getOtherReduce() const; + static inline long invertIndex(const long state); + + static const long CACHE_LINE_SIZE = 64; + + char pad0[CACHE_LINE_SIZE]; + volatile long state; + char pad1[CACHE_LINE_SIZE]; + + allreduceButterfly reduceFirst; + allreduceButterfly reduceSecond; + allreduceButterfly* tableReduce[2]; + }; + } +} + \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/lib/broadcast.cpp b/src/gpi_comm_lib/collectives/lib/broadcast.cpp new file mode 100755 index 00000000..c1b814e1 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/broadcast.cpp @@ -0,0 +1,197 @@ +#include "broadcast.h" +#include "gpi/gaspiCheckReturn.hpp" +#include "mailBoxGaspi.h" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + using tarantella::GPI::gaspiCheckReturn; + + broadcast::broadcast( + const gaspi_rank_t master_, + const long len, + const gaspi_segment_id_t segment_, + const gaspi_offset_t offset_, + const gaspi_notification_id_t firstNotification_, + queues& queues_ ) + : totalLength(len), + group(GASPI_GROUP_ALL), + numRanks(getNumRanks()), + rank(getRank()), + masterRank(master_), + segment(segment_), + offset(offset_), + firstNotification(firstNotification_), + sender(queues_), + status((rank == masterRank) ? 
1 : numRanks){ + + std::vector ranks(numRanks); + gaspiCheckReturn(gaspi_group_ranks(group, &ranks[0]), + "gaspi_group_ranks failed with"); + const unsigned long rankIndex = getRankIndex(rank, ranks); + + if (rank == masterRank) { + setMaster(rankIndex, ranks); + } else { + setWorker(rankIndex, ranks); + } + } + + long broadcast::getNumRanks() const { + gaspi_number_t size; + gaspiCheckReturn(gaspi_group_size(group, &size), + "gaspi_group_size failed with "); + return size; + } + + long broadcast::getRank() { + gaspi_rank_t rank; + gaspiCheckReturn(gaspi_proc_rank(&rank), + "gaspi_proc_rank failed with "); + return rank; + } + + long broadcast::getRankIndex(gaspi_rank_t rank, + const std::vector& ranks) { + unsigned long rankIndex; + if (find(ranks.begin(), ranks.end(), rank) == ranks.end()) { + throw std::runtime_error("rank not member of group"); + } else { + rankIndex = find(ranks.begin(), ranks.end(), rank) + - ranks.begin(); + } + return rankIndex; + } + + void broadcast::setMaster( + const unsigned long rankIndex, + const std::vector& ranks) { + const gaspi_rank_t partner = ranks[getPartnerIndex(rankIndex)]; + + receiver.push_back(&trigger); + + if (partner != rank) { + for (long c=0; c < numRanks; c++) { + writer::transferParameters job( + true, + partner, + segment, + offset + chunkIndexToByte(c), + segment, + offset + chunkIndexToByte(c), + chunkIndexToByte(c + 1) - chunkIndexToByte(c), + firstNotification + c); + jobs.push_back(job); + } + } + } + + inline unsigned long broadcast::getPartnerIndex( + const unsigned long rankIndex) const { + return (rankIndex + 1) % numRanks; + } + + void broadcast::setWorker( + const unsigned long rankIndex, + const std::vector& ranks) { + const gaspi_rank_t partner = ranks[getPartnerIndex(rankIndex)]; + + for (long c=0; c < numRanks; c++) { + receiver.push_back( + new mailBoxGaspi(segment, firstNotification + c)); + + if (partner == masterRank) { + jobs.push_back(writer::transferParameters()); + } else { + writer::transferParameters transfer( + true, + partner, + segment, + offset + chunkIndexToByte(c), + segment, + offset + chunkIndexToByte(c), + chunkIndexToByte(c + 1) - chunkIndexToByte(c), + firstNotification + c); + jobs.push_back(transfer); + } + } + } + + inline unsigned long broadcast::chunkIndexToByte( + const long chunkIndex) const { + return ((totalLength * chunkIndex + numRanks - 1) / numRanks); + } + + broadcast::~broadcast() { + if (rank != masterRank) { + for (unsigned long i=0; i < receiver.size(); i++) { + delete receiver[i]; + } + } + } + + int broadcast::operator()() { + const unsigned long phase = status.get(); + if (!receiver[phase]->gotNotification()) { + return -1; + } + + if (rank == masterRank) { + for (unsigned long i=0; i < jobs.size(); i++) { + sender(jobs[i]); + } + } else { + sender(jobs[phase]); + } + + return (status.increment() == 0) ? 0 : -1; + } + + void broadcast::signal() { + trigger.notify(); + } + + long broadcast::getNumberOfNotifications(const long numRanks) { + return (numRanks > 1) ? 
numRanks : 0; + } + + std::ostream& broadcast::report(std::ostream& s) const { + const unsigned long phase = status.get(); + s << "total length: " << totalLength << std::endl + << "numRanks: " << numRanks << std::endl + << "rank: " << rank << std::endl + << "masterRank: " << masterRank << std::endl + << "segment: " << long(segment) << std::endl + << "offset: " << offset << std::endl + << "firstNotification: " << firstNotification << std::endl + << std::endl + << "phase " << phase << std::endl; + for (unsigned long i=0; i < jobs.size(); i++) { + s << ".........................." << std::endl; + s << "phase " << i << std::endl; + if ((i==0) && (rank == masterRank)) { + s << "Receiver: " << "user" << std::endl; + } else { + if (i < receiver.size()) { + mailBoxGaspi* m = (mailBoxGaspi*) receiver[i]; + s << "Receiver: segment " << long(m->getSegmentID()) + << " notification ID " << m->getMailID() << std::endl; + } else { + s << "Receiver: idle" << std::endl; + } + } + + s << "Send : "; + jobs[i].report(s) << std::endl; + } + s << ".........................." << std::endl; + + return s; + } + } +} + \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/lib/broadcast.h b/src/gpi_comm_lib/collectives/lib/broadcast.h new file mode 100755 index 00000000..e59e0e89 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/broadcast.h @@ -0,0 +1,62 @@ +#pragma once + +#include "writer.h" +#include "mailBox.h" +#include "mailBoxLocal.h" +#include "counter.h" +#include "queues.h" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + class broadcast { + public: + broadcast(const gaspi_rank_t master_, + const long len, + const gaspi_segment_id_t segment_, + const gaspi_offset_t offset_, + const gaspi_notification_id_t firstNotification_, + queues& queues_); + ~broadcast(); + int operator()(); + void signal(); + static long getNumberOfNotifications(const long numRanks); + std::ostream& report(std::ostream& s) const; + + private: + + long getNumRanks() const; + static long getRank(); + static long getRankIndex(gaspi_rank_t rank, + const std::vector& ranks); + void setMaster(const unsigned long rankIndex, + const std::vector& ranks); + inline unsigned long getPartnerIndex(const unsigned long rankIndex) const; + void setWorker(const unsigned long rankIndex, + const std::vector& ranks); + inline unsigned long chunkIndexToByte(const long chunkIndex) const; + inline static char* getSegmentPointer(const gaspi_segment_id_t segment); + + const long totalLength; + const gaspi_group_t group; + const long numRanks; + const gaspi_rank_t rank; + const gaspi_rank_t masterRank; + const gaspi_segment_id_t segment; + const gaspi_offset_t offset; + const gaspi_notification_id_t firstNotification; + + mailBoxLocal trigger; + std::vector receiver; + std::vector jobs; + + writer sender; + counter status; + }; + } +} + \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/lib/counter.cpp b/src/gpi_comm_lib/collectives/lib/counter.cpp new file mode 100755 index 00000000..f36ea582 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/counter.cpp @@ -0,0 +1,19 @@ +#include "counter.h" + +namespace tarantella +{ + namespace collectives + { + counter::counter(const unsigned long phasePeriod_) + : phasePeriod(phasePeriod_), + value(0) {} + + unsigned long counter::increment() { + return (++value) % phasePeriod; + } + + unsigned long counter::get() const { + return value % phasePeriod; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/counter.h 
b/src/gpi_comm_lib/collectives/lib/counter.h new file mode 100755 index 00000000..5a630592 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/counter.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace tarantella +{ + namespace collectives + { + class counter { + public: + counter(const unsigned long phasePeriod_ = 1); + unsigned long increment(); + unsigned long get() const; + private: + + const unsigned long phasePeriod; + std::atomic value; + }; + } +} + \ No newline at end of file diff --git a/src/gpi_comm_lib/collectives/lib/mailBox.h b/src/gpi_comm_lib/collectives/lib/mailBox.h new file mode 100755 index 00000000..a7d9a9d2 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/mailBox.h @@ -0,0 +1,14 @@ +#pragma once + +namespace tarantella +{ + namespace collectives + { + class mailBox + { + public: + virtual bool gotNotification() = 0; + virtual ~mailBox() = default; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp new file mode 100755 index 00000000..3e2b3738 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp @@ -0,0 +1,47 @@ +#include "mailBoxGaspi.h" +#include "gpi/gaspiCheckReturn.hpp" + +#include + +namespace tarantella +{ + namespace collectives + { + using tarantella::GPI::gaspiCheckReturn; + + mailBoxGaspi::mailBoxGaspi(const gaspi_segment_id_t segmentID_, + const gaspi_notification_id_t mailID_) + : segmentID(segmentID_), + mailID(mailID_) {} + + bool mailBoxGaspi::gotNotification() { + gaspi_notification_id_t event; + gaspi_return_t err = gaspi_notify_waitsome(segmentID, + mailID, + 1, + &event, + GASPI_TEST); + if (err == GASPI_TIMEOUT) + { + return false; + } + gaspiCheckReturn(err, "gaspi_notify_waitsome failed with "); + + assert(mailID == event); + gaspi_notification_t value; + gaspiCheckReturn(gaspi_notify_reset(segmentID, + event, + &value), + "gaspi_notify_reset failed with "); + return value != 0; + } + + gaspi_segment_id_t mailBoxGaspi::getSegmentID() const { + return segmentID; + } + + gaspi_notification_id_t mailBoxGaspi::getMailID() const { + return mailID; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h new file mode 100755 index 00000000..c33b6c86 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h @@ -0,0 +1,26 @@ +#pragma once + +#include "mailBox.h" + +#include + +namespace tarantella +{ + namespace collectives + { + class mailBoxGaspi : public mailBox + { + public: + mailBoxGaspi(const gaspi_segment_id_t segmentID_, + const gaspi_notification_id_t mailID_); + bool gotNotification() override; + gaspi_segment_id_t getSegmentID() const; + gaspi_notification_id_t getMailID() const; + + private: + + const gaspi_segment_id_t segmentID; + const gaspi_notification_id_t mailID; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp new file mode 100755 index 00000000..352f7545 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp @@ -0,0 +1,20 @@ +#include "mailBoxLocal.h" + +namespace tarantella +{ + namespace collectives + { + mailBoxLocal::mailBoxLocal() + : status(0), + target(0) {} + + bool mailBoxLocal::gotNotification() { + unsigned long statusOld = status; + return (statusOld < target) && status.compare_exchange_strong(statusOld, statusOld + 1); + } + + void mailBoxLocal::notify() { + ++target; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h 
b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h new file mode 100755 index 00000000..0d9b34fa --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h @@ -0,0 +1,22 @@ +#pragma once + +#include "mailBox.h" +#include + +namespace tarantella +{ + namespace collectives + { + class mailBoxLocal : public mailBox + { + public: + mailBoxLocal(); + bool gotNotification() override; + void notify(); + + private: + std::atomic status; + std::atomic target; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/lib/queues.cpp b/src/gpi_comm_lib/collectives/lib/queues.cpp new file mode 100755 index 00000000..d8feb3cb --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/queues.cpp @@ -0,0 +1,59 @@ +#include "queues.h" +#include "gpi/gaspiCheckReturn.hpp" + +namespace tarantella +{ + namespace collectives + { + using tarantella::GPI::gaspiCheckReturn; + + queues::queues(const long num, + const gaspi_queue_id_t first) + : numQueues(num) + , state(0) { + for (long i=first; i < first + num; i++) { + queueStock.push_back(i); + } + } + + queues::queues(const std::vector& queues_) + : numQueues(queues_.size()), + state(0), + queueStock(queues_) { + } + + gaspi_queue_id_t queues::get() const { + return stateToQueue(state); + } + + inline gaspi_queue_id_t queues::stateToQueue(const long state_) const { + return queueStock[state_ % numQueues]; + } + + gaspi_queue_id_t queues::swap(gaspi_queue_id_t badQueue) { + const long stateLocal = state; + const gaspi_queue_id_t queueLocal = stateToQueue(stateLocal); + + if (queueLocal != badQueue) { + return queueLocal; + } else { + const long stateLocalNew = stateLocal + 1; + const gaspi_queue_id_t queueLocalNew = stateToQueue(stateLocalNew); + + clearQueue(queueLocalNew); + + const long stateBeforeSwap = + __sync_val_compare_and_swap(&state, stateLocal, stateLocalNew); + + return (stateBeforeSwap == stateLocal) + ? 
queueLocalNew + : stateToQueue(stateBeforeSwap); + }; + } + + inline void queues::clearQueue(const gaspi_queue_id_t queue) { + gaspiCheckReturn(gaspi_wait(queue, GASPI_BLOCK), + "Failed to clear queue with "); + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/queues.h b/src/gpi_comm_lib/collectives/lib/queues.h new file mode 100755 index 00000000..0679a7d0 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/queues.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +namespace tarantella +{ + namespace collectives + { + class queues { + public: + queues(const long num = 2, + const gaspi_queue_id_t first = 0); + queues(const std::vector& queues_); + + gaspi_queue_id_t get() const; + gaspi_queue_id_t swap(gaspi_queue_id_t badQueue); + + private: + inline gaspi_queue_id_t stateToQueue(const long) const; + inline void clearQueue(const gaspi_queue_id_t queue); + + static const long CACHE_LINE_SIZE = 64; + const long numQueues; + + char pad0 [CACHE_LINE_SIZE]; + volatile long state; + char pad1 [CACHE_LINE_SIZE]; + + std::vector queueStock; + }; + } +} diff --git a/src/gpi_comm_lib/collectives/lib/reduce.cpp b/src/gpi_comm_lib/collectives/lib/reduce.cpp new file mode 100755 index 00000000..6e1ba9c8 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/reduce.cpp @@ -0,0 +1,188 @@ +#include "reduce.h" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + namespace + { + template + inline void add(const reduce::task& t) { + const T* const a = (const T*) t.source; + T* const b = (T*) t.destination; + const long n = t.len; + + for (long i=0; i < n; i++) { + b[i] += a[i]; + } + } + + template + inline void average(const reduce::task& t) { + if (t.scaling > 1) { + const T* const a = (const T*) t.source; + T* const b = (T*) t.destination; + const long n = t.len; + const T s = t.scaling; + + for (long i=0; i < n; i++) { + b[i] = (b[i] + a[i]) / s; + } + } else { + add(t); + } + } + + template + inline void averageopt(const reduce::task& t) { + if (t.scaling > 1) { + const T* const a = (const T*) t.source; + T* const b = (T*) t.destination; + const long n = t.len; + const T s = T(1) / T(t.scaling); + + for (long i=0; i < n; i++) { + b[i] = (b[i] + a[i]) * s; + } + } else { + add(t); + } + } + + class reduce_float_sum : public reduce { + public: + void operator()(const task& t) const { + add(t); + } + }; + + class reduce_float_average : public reduce { + public: + void operator()(const task& t) const { + averageopt(t); + } + }; + + class reduce_double_sum : public reduce { + public: + void operator()(const task& t) const { + add(t); + } + }; + + class reduce_double_average : public reduce { + public: + void operator()(const task& t) const { + averageopt(t); + } + }; + + class reduce_int16_sum : public reduce { + public: + void operator()(const task& t) const { + add(t); + } + }; + + class reduce_int16_average : public reduce { + public: + void operator()(const task& t) const { + average(t); + } + }; + + class reduce_int32_sum : public reduce { + public: + void operator()(const task& t) const { + add(t); + } + }; + + class reduce_int32_average : public reduce { + public: + void operator()(const task& t) const { + average(t); + } + }; + } + + reduce * getReduce(const allreduce::dataType data, + const allreduce::reductionType reduction) { + reduce* p = NULL; + + switch (data) { + case allreduce::FLOAT: + switch (reduction) { + case allreduce::SUM: + p = new reduce_float_sum(); + break; + case allreduce::AVERAGE: + p = new reduce_float_average(); + break; + default: + 
break; + } + break; + case allreduce::DOUBLE: + switch (reduction) { + case allreduce::SUM: + p = new reduce_double_sum; + break; + case allreduce::AVERAGE: + p = new reduce_double_average; + break; + default: + break; + } + break; + case allreduce::INT16: + switch (reduction) { + case allreduce::SUM: + p = new reduce_int16_sum; + break; + case allreduce::AVERAGE: + p = new reduce_int16_average; + break; + default: + break; + } + break; + case allreduce::INT32: + switch (reduction) { + case allreduce::SUM: + p = new reduce_int32_sum; + break; + case allreduce::AVERAGE: + p = new reduce_int32_average; + break; + default: + break; + } + break; + default: + break; + }; + + if (p == NULL) { + throw std::runtime_error( + "Unsupported combination of data type and reduction type"); + } + + return p; + } + + size_t getDataTypeSize(const allreduce::dataType d) { + const size_t sizes[allreduce::NUM_TYPE] = { + sizeof(float), + sizeof(double), + sizeof(int16_t), + sizeof(int32_t) + }; + + return sizes[d]; + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/reduce.h b/src/gpi_comm_lib/collectives/lib/reduce.h new file mode 100755 index 00000000..e0b7ea91 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/reduce.h @@ -0,0 +1,34 @@ +#pragma once + +#include "allreduce.h" + +#include + +namespace tarantella +{ + namespace collectives + { + class reduce { + public: + struct task { + const void* source; + void* destination; + long len; + unsigned long scaling; + task(const void* s = NULL, + void* d = NULL, + long n = 0, + unsigned long sc = 0) + : source(s), destination(d), len(n), scaling(sc) {} + }; + + virtual void operator()(const task& t) const = 0; + virtual ~reduce() {} + }; + + reduce * getReduce(const allreduce::dataType data, + const allreduce::reductionType reduction); + + size_t getDataTypeSize(const allreduce::dataType d); + } +} diff --git a/src/gpi_comm_lib/collectives/lib/writer.cpp b/src/gpi_comm_lib/collectives/lib/writer.cpp new file mode 100755 index 00000000..ed5ad550 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/writer.cpp @@ -0,0 +1,80 @@ +#include "writer.h" +#include "gpi/gaspiCheckReturn.hpp" + +#include + +namespace tarantella +{ + namespace collectives + { + const gaspi_size_t writer::MESSAGE_LENGTH_LIMIT = 0x40000000; + + using tarantella::GPI::gaspiCheckReturn; + + writer::transferParameters::transferParameters( + bool a, + gaspi_rank_t r, + gaspi_segment_id_t sl, + gaspi_offset_t ol, + gaspi_segment_id_t sr, + gaspi_offset_t orm, + gaspi_size_t sz, + gaspi_notification_id_t id) + : active(a), + rank(r), + segmentLocal(sl), + offsetLocal(ol), + segmentRemote(sr), + offsetRemote(orm), + size(sz), + notificationID(id) + {} + + std::ostream& writer::transferParameters::report(std::ostream& s) const { + if (active) { + s << "rank " << rank + << " | sl " << long(segmentLocal) + << " ol " << offsetLocal + << " | sr " << long(segmentRemote) + << " or " << offsetRemote + << " ID " << notificationID + << " | sz " << size; + } else { + s << "idle"; + } + return s; + } + + writer::writer(queues& queues_) + : queueSource(queues_) {} + + void writer::operator()(const transferParameters& p) { + if (!p.active) return; + //thread save? watch queue management! 
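+    // After the message-size check below, the transfer is posted with
+    // gaspi_write_notify and retried whenever the selected queue reports
+    // GASPI_QUEUE_FULL: queues::swap() waits (gaspi_wait) for the next queue
+    // to drain and switches to it via a compare-and-swap on the shared state;
+    // any other error code is turned into an exception by gaspiCheckReturn.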
+ + if (p.size > MESSAGE_LENGTH_LIMIT) { + throw std::runtime_error("writer: message is too long"); + } + + gaspi_return_t err; + gaspi_queue_id_t queueLocal = queueSource.get(); + while ((err = gaspi_write_notify(p.segmentLocal, + p.offsetLocal, + p.rank, + p.segmentRemote, + p.offsetRemote, + p.size, + p.notificationID, + 1, + queueLocal, + GASPI_BLOCK)) + != GASPI_SUCCESS) { + if (err == GASPI_QUEUE_FULL) { + queueLocal = queueSource.swap(queueLocal); + } else { + gaspiCheckReturn(err, "gaspi_write_notify failed with "); + } + } + } + } +} diff --git a/src/gpi_comm_lib/collectives/lib/writer.h b/src/gpi_comm_lib/collectives/lib/writer.h new file mode 100755 index 00000000..db580448 --- /dev/null +++ b/src/gpi_comm_lib/collectives/lib/writer.h @@ -0,0 +1,46 @@ +#pragma once + +#include "queues.h" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + class writer { + public: + struct transferParameters { + bool active; + gaspi_rank_t rank; + gaspi_segment_id_t segmentLocal; + gaspi_offset_t offsetLocal; + gaspi_segment_id_t segmentRemote; + gaspi_offset_t offsetRemote; + gaspi_size_t size; + gaspi_notification_id_t notificationID; + transferParameters( + bool a = false, + gaspi_rank_t r = 0, + gaspi_segment_id_t sl = 0, + gaspi_offset_t ol = 0, + gaspi_segment_id_t sr = 0, + gaspi_offset_t orm = 0, + gaspi_size_t sz = 0, + gaspi_notification_id_t id = 0); + std::ostream& report(std::ostream& s) const; + }; + + writer(queues& queues_); + void operator()(const transferParameters& p); + + private: + + static const gaspi_size_t MESSAGE_LENGTH_LIMIT; + + queues& queueSource; + }; + + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/distribution/GroupBuilder.hpp b/src/gpi_comm_lib/distribution/GroupBuilder.hpp new file mode 100644 index 00000000..a3b78b8f --- /dev/null +++ b/src/gpi_comm_lib/distribution/GroupBuilder.hpp @@ -0,0 +1,34 @@ +#pragma once + +#include "gpi/Context.hpp" +#include "gpi/ResourceManager.hpp" + +#include + +namespace tarantella +{ + namespace distribution + { + class DataParallelGroupBuilder + { + public: + DataParallelGroupBuilder(GPI::Context& context) + : context(context) + { } + + GPI::Group const get_group() + { + auto& resource_manager = context.get_resource_manager(); + auto const num_ranks = context.get_comm_size(); + + std::vector all_ranks(num_ranks); + std::iota(all_ranks.begin(), all_ranks.end(), static_cast(0)); + + return resource_manager.make_group(all_ranks); + } + + private: + GPI::Context& context; + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/distribution/SegmentIDBuilder.cpp b/src/gpi_comm_lib/distribution/SegmentIDBuilder.cpp new file mode 100644 index 00000000..0ddbac0f --- /dev/null +++ b/src/gpi_comm_lib/distribution/SegmentIDBuilder.cpp @@ -0,0 +1,19 @@ +#include "SegmentIDBuilder.hpp" + +namespace tarantella +{ + namespace distribution + { + GPI::SegmentID DataParallelSegmentIDBuilder::segment_id = 0UL; + + GPI::SegmentID DataParallelSegmentIDBuilder::get_segment_id() + { + return segment_id++; + } + + GPI::SegmentID PipelineSegmentIDBuilder::get_segment_id(PipelineCommunicator::ConnectionID id) + { + return static_cast(id); + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/distribution/SegmentIDBuilder.hpp b/src/gpi_comm_lib/distribution/SegmentIDBuilder.hpp new file mode 100644 index 00000000..2b46dc73 --- /dev/null +++ b/src/gpi_comm_lib/distribution/SegmentIDBuilder.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "PipelineCommunicator.hpp" + +namespace 
tarantella +{ + namespace distribution + { + class DataParallelSegmentIDBuilder + { + public: + DataParallelSegmentIDBuilder() = default; + + GPI::SegmentID get_segment_id(); + + private: + static GPI::SegmentID segment_id; + }; + + class PipelineSegmentIDBuilder + { + public: + PipelineSegmentIDBuilder() = default; + + GPI::SegmentID get_segment_id(PipelineCommunicator::ConnectionID id); + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/distribution/utilities.cpp b/src/gpi_comm_lib/distribution/utilities.cpp new file mode 100644 index 00000000..6b383dc4 --- /dev/null +++ b/src/gpi_comm_lib/distribution/utilities.cpp @@ -0,0 +1,23 @@ +#include "utilities.hpp" + +#include +#include + +namespace tarantella +{ + namespace distribution + { + std::size_t get_segment_size(std::vector const& DNN, double overhead_factor) + { + if(DNN.size() == 0) + { + throw std::logic_error("tarantella::get_segment_size: Empty DNN to SynchCommunicator provided"); + } + + auto add_tensor_size_in_bytes = [](auto sum, auto tensor_info){ + return sum + (tensor_info.get_nelems() * getDataTypeSize(tensor_info.get_elem_type())); }; + auto const partition_size = std::accumulate(DNN.begin(), DNN.end(), 0UL, add_tensor_size_in_bytes); + return overhead_factor * partition_size; + } + } +} diff --git a/src/gpi_comm_lib/distribution/utilities.hpp b/src/gpi_comm_lib/distribution/utilities.hpp new file mode 100644 index 00000000..7ecdf542 --- /dev/null +++ b/src/gpi_comm_lib/distribution/utilities.hpp @@ -0,0 +1,15 @@ +#pragma once + +#include "collectives/TensorInfo.hpp" + +#include +#include + +namespace tarantella +{ + namespace distribution + { + std::size_t get_segment_size(std::vector const& DNN, double overhead_factor); + } +} + diff --git a/src/gpi_comm_lib/gpi/CMakeLists.txt b/src/gpi_comm_lib/gpi/CMakeLists.txt new file mode 100644 index 00000000..ff442578 --- /dev/null +++ b/src/gpi_comm_lib/gpi/CMakeLists.txt @@ -0,0 +1,29 @@ +include (add_macros) + +set (GPIRESOURCES_SOURCES + ${SRC_DIR}/gpi_comm_lib/gpi/Context.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/Group.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/GroupManager.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/NotificationManager.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/QueueManager.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/ResourceManager.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/Segment.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/SegmentBuffer.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/SegmentManager.cpp + ${SRC_DIR}/gpi_comm_lib/gpi/gaspiCheckReturn.cpp +) +extended_add_library(NAME gpiresources + NAMESPACE tnt + TYPE SHARED + SOURCES + ${GPIRESOURCES_SOURCES} + LIBRARIES + optimized GPI2::GPI2 + debug GPI2::GPI2dbg + INCLUDE_DIRECTORIES + ${SRC_DIR}/gpi_comm_lib/ + INSTALL + INSTALL_DESTINATION + ${INSTALL_LIB_DIR} + POSITION_INDEPENDENT) + diff --git a/src/gpi_comm_lib/gpi/Context.cpp b/src/gpi_comm_lib/gpi/Context.cpp new file mode 100644 index 00000000..bc1ccee1 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Context.cpp @@ -0,0 +1,92 @@ +#include "Context.hpp" + +#include "gpi/gaspiCheckReturn.hpp" +#include "gpi/Group.hpp" +#include "gpi/ResourceManager.hpp" + +#include +#include + +namespace tarantella +{ + namespace GPI + { + Context::Context() + : rank(0), comm_size(0) + { + gaspiCheckReturn(gaspi_proc_init(GASPI_BLOCK), + "GPI library initialization"); + gaspiCheckReturn(gaspi_proc_rank(&rank), + "get rank"); + gaspi_rank_t size; // gaspi_proc_num expects gaspi_rank_t + gaspiCheckReturn(gaspi_proc_num(&size), + "get number of processes"); + comm_size = size; + } + + Context::~Context() + { + 
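+      // Synchronize all ranks (barrier bounded by timeout_millis) before
+      // calling gaspi_proc_term, so the GPI library is not shut down while
+      // other processes may still be communicating.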
gaspiCheckReturn(gaspi_barrier(GASPI_GROUP_ALL, timeout_millis), + "gaspi_barrier"); + gaspiCheckReturn(gaspi_proc_term(GASPI_BLOCK), + "GPI library finalize"); + } + + Rank Context::get_rank() const + { + return rank; + } + + std::size_t Context::get_comm_size() const + { + return comm_size; + } + + tarantella::GPI::ResourceManager& Context::get_resource_manager() + { + return tarantella::GPI::ResourceManager::get_instance(*this); + } + + void Context::allocate_segment(SegmentID id, Group const& group, std::size_t total_size) + { + if (total_size == 0) + { + throw std::runtime_error("Context::allocate_segment : Cannot allocate segment of size zero"); + } + + if (!group.contains_rank(get_rank())) + { + throw std::runtime_error("Context::allocate_segment : Group does not contain rank"); + } + + gaspiCheckReturn(gaspi_segment_alloc(id, total_size, GASPI_MEM_UNINITIALIZED), + "Context::allocate_segment : segment could not be allocated"); + for (auto other_rank : group.get_ranks()) + { + if (other_rank != get_rank()) + { + gaspiCheckReturn(gaspi_segment_register(id, other_rank, GASPI_BLOCK), + "Context::allocate_segment : segment could not be registered"); + } + } + } + + void Context::deallocate_segment(SegmentID id, Group const& group) + { + if (!group.contains_rank(get_rank())) + { + throw std::runtime_error("Context::deallocate_segment : Group does not contain rank"); + } + gaspiCheckReturn(gaspi_segment_delete(id), + "Context::deallocate_segment : segment could not be deleted"); + } + + void* Context::get_segment_pointer(SegmentID id) const + { + void* p; + gaspiCheckReturn(gaspi_segment_ptr(id, &p), "get pointer within segment"); + return p; + } + } +} + diff --git a/src/gpi_comm_lib/gpi/Context.hpp b/src/gpi_comm_lib/gpi/Context.hpp new file mode 100644 index 00000000..4c37b904 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Context.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "Types.hpp" + +#include + +#include +#include + +namespace tarantella +{ + namespace GPI + { + class Group; + class ResourceManager; + + class Context + { + public: + + Context(); + Context(Context const& other) = delete; + Context& operator=(Context const& other) = delete; + ~Context(); + + Rank get_rank() const; + std::size_t get_comm_size() const; + tarantella::GPI::ResourceManager& get_resource_manager(); + + void allocate_segment(SegmentID id, Group const&, std::size_t total_size); + void deallocate_segment(SegmentID id, Group const&); + void* get_segment_pointer(SegmentID id) const; + + private: + Rank rank; + std::size_t comm_size; + size_t const timeout_millis = 1000; + }; + } +} diff --git a/src/gpi_comm_lib/gpi/Group.cpp b/src/gpi_comm_lib/gpi/Group.cpp new file mode 100644 index 00000000..80bbebb1 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Group.cpp @@ -0,0 +1,40 @@ +#include "Group.hpp" + +#include + +#include +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + Group::Group(std::vector const &ranks_to_add) + : ranks(ranks_to_add) + { + if (ranks.size() == 0) + { + throw std::runtime_error("Group: Cannot create empty group"); + } + std::sort(ranks.begin(), ranks.end()); + } + + std::size_t Group::get_size() const + { + return ranks.size(); + } + + bool Group::contains_rank(Rank rank) const + { + auto const iter = std::find(ranks.begin(), ranks.end(), rank); + return iter != ranks.end(); + } + + std::vector const& Group::get_ranks() const + { + return ranks; + } + } +} diff --git a/src/gpi_comm_lib/gpi/Group.hpp b/src/gpi_comm_lib/gpi/Group.hpp new file mode 100644 index 
00000000..535d11d0 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Group.hpp @@ -0,0 +1,27 @@ +#pragma once + +#include "Types.hpp" + +#include + +#include +#include + +namespace tarantella +{ + namespace GPI + { + class Group + { + public: + Group(std::vector const&); + + std::size_t get_size() const; + bool contains_rank(Rank) const; + std::vector const& get_ranks() const; + + private: + std::vector ranks; + }; + } +} diff --git a/src/gpi_comm_lib/gpi/GroupManager.cpp b/src/gpi_comm_lib/gpi/GroupManager.cpp new file mode 100644 index 00000000..693f24e7 --- /dev/null +++ b/src/gpi_comm_lib/gpi/GroupManager.cpp @@ -0,0 +1,18 @@ +#include "GroupManager.hpp" + +namespace tarantella +{ + namespace GPI + { + GPI::Group const GroupManager::create_group(std::vector const& ranks) + { + groups.emplace_back(ranks); + return groups.back(); + } + + std::vector const& GroupManager::get_groups() const + { + return groups; + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/GroupManager.hpp b/src/gpi_comm_lib/gpi/GroupManager.hpp new file mode 100644 index 00000000..6962f473 --- /dev/null +++ b/src/gpi_comm_lib/gpi/GroupManager.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "Types.hpp" +#include "Group.hpp" + +#include +#include + +namespace tarantella +{ + namespace GPI + { + class GroupManager + { + public: + GroupManager() = default; + GroupManager(GroupManager const&) = delete; + GroupManager& operator=(GroupManager const&) = delete; + ~GroupManager() = default; + + GPI::Group const create_group(std::vector const&); + std::vector const& get_groups() const; + + private: + std::vector groups; + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/NotificationManager.cpp b/src/gpi_comm_lib/gpi/NotificationManager.cpp new file mode 100644 index 00000000..a38eb096 --- /dev/null +++ b/src/gpi_comm_lib/gpi/NotificationManager.cpp @@ -0,0 +1,59 @@ +#include "gpi/NotificationManager.hpp" +#include "gaspiCheckReturn.hpp" + +#include + +#include + +namespace tarantella +{ + namespace GPI + { + namespace + { + std::size_t get_number_available_notifications() + { + gaspi_number_t notifications_available; + gaspiCheckReturn(gaspi_notification_num(¬ifications_available), + "[NotificationManager::get_number_available_notifications()] GASPI:\ + Could not get number of available notifications"); + return notifications_available; + } + } + + NotificationManager::NotificationManager() + : max_notification_id(get_number_available_notifications()), next_notification_ids() + { } + + void NotificationManager::register_segment(GPI::SegmentID id) + { + if(next_notification_ids.find(id) != next_notification_ids.end()) + { + throw std::runtime_error("[NotificationManager::register_segment]:\ + Segment already registered"); + } + next_notification_ids[id] = 0UL; + } + + NotificationManager::NotificationRange + NotificationManager::get_notification_range(GPI::SegmentID id, std::size_t size) + { + if(next_notification_ids.find(id) == next_notification_ids.end()) + { + throw std::runtime_error("[NotificationManager::get_notification_range]:\ + Segment not registered"); + } + + if(next_notification_ids[id] + size > max_notification_id) + { + throw std::runtime_error("[NotificationManager::get_notification_range]:\ + Not enough notifications left"); + } + + NotificationManager::NotificationRange const range = {next_notification_ids[id], + next_notification_ids[id] + size}; + next_notification_ids[id] += size; + return range; + } + } +} \ No newline at end of file diff --git 
a/src/gpi_comm_lib/gpi/NotificationManager.hpp b/src/gpi_comm_lib/gpi/NotificationManager.hpp new file mode 100644 index 00000000..00e05105 --- /dev/null +++ b/src/gpi_comm_lib/gpi/NotificationManager.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include "Context.hpp" + +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + class NotificationManager + { + public: + using NotificationID = std::size_t; + using NotificationRange = std::pair; + + NotificationManager(); + NotificationManager(NotificationManager const&) = delete; + NotificationManager& operator=(NotificationManager const &) = delete; + ~NotificationManager() = default; + + void register_segment(GPI::SegmentID); + NotificationRange get_notification_range(GPI::SegmentID, std::size_t); + + private: + std::size_t const max_notification_id; + std::unordered_map next_notification_ids; + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/QueueManager.cpp b/src/gpi_comm_lib/gpi/QueueManager.cpp new file mode 100644 index 00000000..2f7e16d1 --- /dev/null +++ b/src/gpi_comm_lib/gpi/QueueManager.cpp @@ -0,0 +1,129 @@ +#include "QueueManager.hpp" + +#include "gpi/gaspiCheckReturn.hpp" + +#include + +#include +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + namespace + { + std::size_t get_slots_per_gaspi_queue() + { + gaspi_number_t slots; + gaspiCheckReturn(gaspi_queue_size_max(&slots), + "[QueueManager::get_slots_per_gaspi_queue()] GASPI:\ + Error in gaspi_queue_size_max"); + return slots; + } + + std::size_t get_number_allocated_gaspi_queues() + { + gaspi_number_t number_queues; + gaspiCheckReturn(gaspi_queue_num(&number_queues), + "[QueueManager::get_number_allocated_gaspi_queues()] GASPI:\ + Could not get number of allocated queues"); + return static_cast(number_queues); + } + + std::size_t get_number_gaspi_queues() + { + std::size_t const number_queues_want_to_use = 10; + gaspi_number_t number_queues_allowed; + gaspiCheckReturn(gaspi_queue_max(&number_queues_allowed), + "[QueueManager::get_number_gaspi_queues()] GASPI:\ + Could not get max number of queues"); + return std::min(number_queues_want_to_use, + static_cast(number_queues_allowed)); + } + + auto queue_has_two_empty_slots(std::size_t total_slots) + { + return [total_slots](auto queue) + { + gaspi_number_t non_empty_slots; + gaspiCheckReturn(gaspi_queue_size(queue, &non_empty_slots), + "[QueueManager::queue_has_two_empty_slots()] GASPI:\ + Error in gaspi_queue_size"); + return total_slots >= non_empty_slots + 2; + }; + } + } + + QueueManager& QueueManager::get_instance() + { + static auto instance = new QueueManager(); + return *instance; + } + + QueueManager::QueueManager() + : num_preallocated_queues(get_number_allocated_gaspi_queues()), + gaspi_queues(get_number_gaspi_queues()), + slots_per_gaspi_queue(get_slots_per_gaspi_queue()), + rng(std::random_device()()) + { + auto const end = std::min(gaspi_queues.size(), num_preallocated_queues); + std::iota(gaspi_queues.begin(), gaspi_queues.begin() + end, 0); + + // allocate remaining queues + if (num_preallocated_queues < gaspi_queues.size()) + { + auto const start_unallocated_queues_it = gaspi_queues.begin() + num_preallocated_queues; + auto const num_unallocated_queues = gaspi_queues.size() - num_preallocated_queues; + std::generate_n(start_unallocated_queues_it, num_unallocated_queues, + []() { + gaspi_queue_id_t q; + gaspiCheckReturn(gaspi_queue_create(&q, GASPI_BLOCK), + "[QueueManager::QueueManager()] GASPI:\ + Could not create queue"); + return q; + }); + } 
+ } + + QueueManager::~QueueManager() + { + wait_and_flush_queue(); + + // only delete the queues allocated by the manager + std::sort(gaspi_queues.begin(), gaspi_queues.end(), std::greater()); + if (num_preallocated_queues < gaspi_queues.size()) + { + for (auto q = gaspi_queues.begin(); q != gaspi_queues.end() - num_preallocated_queues; ++q) + { + gaspiCheckReturn(gaspi_queue_delete(*q), + "[QueueManager::QueueManager()] GASPI: Could not delete queue"); + } + } + } + + QueueID QueueManager::get_queue_id_for_write_notify() + { + std::shuffle(gaspi_queues.begin(), gaspi_queues.end(), rng); + auto const valid_queue = std::find_if(gaspi_queues.begin(), gaspi_queues.end(), + queue_has_two_empty_slots(slots_per_gaspi_queue)); + if(valid_queue != gaspi_queues.end()) return *valid_queue; + else return wait_and_flush_queue(gaspi_queues.front()); + } + + void QueueManager::wait_and_flush_queue() + { + for(auto q : gaspi_queues) wait_and_flush_queue(q); + } + + QueueID QueueManager::wait_and_flush_queue(QueueID id) + { + gaspiCheckReturn(gaspi_wait(id, GASPI_BLOCK), + "[QueueManager::wait_and_flush_queue()] GASPI:\ + Error while waiting on queue"); + return id; + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/QueueManager.hpp b/src/gpi_comm_lib/gpi/QueueManager.hpp new file mode 100644 index 00000000..3d14d6e5 --- /dev/null +++ b/src/gpi_comm_lib/gpi/QueueManager.hpp @@ -0,0 +1,37 @@ +#pragma once + +#include "gpi/Types.hpp" + +#include + +#include +#include + +namespace tarantella +{ + namespace GPI + { + class QueueManager + { + public: + static QueueManager& get_instance(); + QueueManager(QueueManager const&) = delete; + QueueManager& operator=(QueueManager const&) = delete; + ~QueueManager(); + + QueueID get_queue_id_for_write_notify(); + void wait_and_flush_queue(); + + private: + QueueManager(); + QueueID wait_and_flush_queue(QueueID); + + // Assumption: the IDs of the preallocated queues are in the + // [0, num_preallocated_queues-1) range + std::size_t const num_preallocated_queues; + std::vector gaspi_queues; + std::size_t const slots_per_gaspi_queue; + std::mt19937 rng; + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/ResourceManager.cpp b/src/gpi_comm_lib/gpi/ResourceManager.cpp new file mode 100644 index 00000000..5f07902c --- /dev/null +++ b/src/gpi_comm_lib/gpi/ResourceManager.cpp @@ -0,0 +1,57 @@ +#include "ResourceManager.hpp" + +#include +#include + +namespace tarantella +{ + namespace GPI + { + ResourceManager& ResourceManager::get_instance(GPI::Context& context) + { + static auto instance = new ResourceManager(context); + return *instance; + } + + ResourceManager::ResourceManager(GPI::Context& context) + : queueManager(GPI::QueueManager::get_instance()), + groupManager(), notificationManager(), segmentManager(context) + { } + + void ResourceManager::make_segment_resources(GPI::SegmentID id, GPI::Group const& group, std::size_t size) + { + segmentManager.create_segment(id, group, size); + notificationManager.register_segment(id); + } + + GPI::Group const ResourceManager::make_group(std::vector const& ranks) + { + return groupManager.create_group(ranks); + } + + std::vector const& ResourceManager::get_groups() const + { + return groupManager.get_groups(); + } + + GPI::QueueID ResourceManager::get_queue_id_for_write_notify() + { + return queueManager.get_queue_id_for_write_notify(); + } + + void ResourceManager::wait_and_flush_queue() + { + queueManager.wait_and_flush_queue(); + } + + GPI::NotificationRange 
ResourceManager::get_notification_range(GPI::SegmentID id, std::size_t s) + { + return notificationManager.get_notification_range(id, s); + } + + GPI::SegmentBuffer ResourceManager::get_buffer_of_size(GPI::SegmentID id, std::size_t s) + { + return segmentManager.get_buffer_of_size(id, s); + } + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/ResourceManager.hpp b/src/gpi_comm_lib/gpi/ResourceManager.hpp new file mode 100644 index 00000000..ef8bc753 --- /dev/null +++ b/src/gpi_comm_lib/gpi/ResourceManager.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include "gpi/Context.hpp" +#include "gpi/GroupManager.hpp" +#include "gpi/NotificationManager.hpp" +#include "gpi/QueueManager.hpp" +#include "gpi/SegmentManager.hpp" +#include "gpi/SegmentBuffer.hpp" +#include "gpi/Types.hpp" + +#include + +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + class ResourceManager + { + public: + static ResourceManager &get_instance(GPI::Context &); + ResourceManager() = delete; + ResourceManager(ResourceManager const&) = delete; + ResourceManager& operator=(ResourceManager const&) = delete; + ~ResourceManager() = default; + + void make_segment_resources(GPI::SegmentID, GPI::Group const&, std::size_t); + GPI::Group const make_group(std::vector const&); + std::vector const& get_groups() const; + GPI::QueueID get_queue_id_for_write_notify(); + void wait_and_flush_queue(); + GPI::NotificationRange get_notification_range(GPI::SegmentID, std::size_t); + GPI::SegmentBuffer get_buffer_of_size(GPI::SegmentID, std::size_t); + + private: + ResourceManager(GPI::Context&); + + GPI::QueueManager& queueManager; + GPI::GroupManager groupManager; + GPI::NotificationManager notificationManager; + GPI::SegmentManager segmentManager; + }; + } +} diff --git a/src/gpi_comm_lib/gpi/Segment.cpp b/src/gpi_comm_lib/gpi/Segment.cpp new file mode 100644 index 00000000..257a636c --- /dev/null +++ b/src/gpi_comm_lib/gpi/Segment.cpp @@ -0,0 +1,39 @@ +#include "Segment.hpp" + +namespace tarantella +{ + namespace GPI + { + Segment::Segment(Context& context, + Group const& group, + SegmentID id, + std::size_t size): + context(context), group(group), id(id), + size(size), ptr(nullptr) + { + context.allocate_segment(id, group, size); + ptr = context.get_segment_pointer(id); + } + + Segment::~Segment() + { + context.deallocate_segment(id, group); + } + + SegmentID Segment::get_id() const + { + return id; + } + + std::size_t Segment::get_size() const + { + return size; + } + + void* Segment::get_ptr() const + { + return ptr; + } + } +} + diff --git a/src/gpi_comm_lib/gpi/Segment.hpp b/src/gpi_comm_lib/gpi/Segment.hpp new file mode 100644 index 00000000..f3ebed12 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Segment.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include "Context.hpp" +#include "Group.hpp" + +#include + +#include + +namespace tarantella +{ + namespace GPI + { + class Segment + { + public: + + Segment(Context& context, Group const&, SegmentID, std::size_t ); + Segment(Segment const& other) = delete; + Segment& operator=(Segment const& other) = delete; + Segment(Segment&& other) = delete; + Segment& operator=(Segment&& other) = delete; + ~Segment(); + + std::size_t get_size() const; + SegmentID get_id() const; + void* get_ptr() const; + + private: + + Context& context; + Group const group; + + SegmentID const id; + std::size_t const size; + void* /* const */ ptr; + }; + } +} + diff --git a/src/gpi_comm_lib/gpi/SegmentBuffer.cpp b/src/gpi_comm_lib/gpi/SegmentBuffer.cpp new file mode 100644 index 
00000000..e2539a1d --- /dev/null +++ b/src/gpi_comm_lib/gpi/SegmentBuffer.cpp @@ -0,0 +1,21 @@ + +#include "SegmentBuffer.hpp" + +#include + +namespace tarantella +{ + namespace GPI + { + SegmentBuffer::SegmentBuffer(GPI::Segment const& s, std::size_t offset, std::size_t size) + : id(s.get_id()), offset(offset), size(size), + ptr(reinterpret_cast(s.get_ptr()) + offset) + { } + + SegmentID SegmentBuffer::get_segment_id() const { return id; } + std::size_t SegmentBuffer::get_size() const { return size; } + std::size_t SegmentBuffer::get_offset() const { return offset; } + void* SegmentBuffer::get_ptr() const { return ptr; } + + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/SegmentBuffer.hpp b/src/gpi_comm_lib/gpi/SegmentBuffer.hpp new file mode 100644 index 00000000..e0451286 --- /dev/null +++ b/src/gpi_comm_lib/gpi/SegmentBuffer.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include "Segment.hpp" + +namespace tarantella +{ + namespace GPI + { + class SegmentBuffer + { + public: + SegmentBuffer(GPI::Segment const &s, std::size_t offset, std::size_t size); + SegmentBuffer(SegmentBuffer const& other) = default; + SegmentBuffer& operator=(SegmentBuffer const&) = delete; + SegmentBuffer(SegmentBuffer&&) = default; + SegmentBuffer& operator=(SegmentBuffer&&) = delete; + ~SegmentBuffer() = default; + + SegmentID get_segment_id() const; + std::size_t get_size() const; + std::size_t get_offset() const; + void* get_ptr() const; + + private: + SegmentID const id; + std::size_t const offset; + std::size_t const size; + void* const ptr; + }; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/SegmentManager.cpp b/src/gpi_comm_lib/gpi/SegmentManager.cpp new file mode 100644 index 00000000..239ae493 --- /dev/null +++ b/src/gpi_comm_lib/gpi/SegmentManager.cpp @@ -0,0 +1,45 @@ +#include "SegmentManager.hpp" + +#include +#include + +namespace tarantella +{ + namespace GPI + { + SegmentManager::SegmentManager(GPI::Context& context) + : context(context), segments() + { } + + void SegmentManager::create_segment(GPI::SegmentID id, GPI::Group const& group, std::size_t size) + { + if(segments.find(id) != segments.end()) + { + throw std::runtime_error("[SegmentManager::create_segment]:\ + Segment already exists"); + } + segments.emplace(std::make_pair(id, AllocatedSegment(context, group, id, size, 0UL))); + } + + SegmentBuffer SegmentManager::get_buffer_of_size(GPI::SegmentID id, std::size_t buffer_size) + { + if(segments.find(id) == segments.end()) + { + throw std::runtime_error("[SegmentManager::get_buffer_of_size]:\ + Segment not allocated"); + } + + auto& segment = segments.at(id).segment; + auto const current_offset = segments.at(id).current_offset; + if(current_offset + buffer_size > segment->get_size()) + { + throw std::runtime_error("[SegmentManager::get_buffer_of_size]:\ + Out of memory"); + } + + SegmentBuffer const segmentBuffer(*segment, current_offset, buffer_size); + segments.at(id).current_offset = current_offset + buffer_size; + return segmentBuffer; + } + } +} diff --git a/src/gpi_comm_lib/gpi/SegmentManager.hpp b/src/gpi_comm_lib/gpi/SegmentManager.hpp new file mode 100644 index 00000000..67945e14 --- /dev/null +++ b/src/gpi_comm_lib/gpi/SegmentManager.hpp @@ -0,0 +1,47 @@ +#pragma once + +#include "Context.hpp" +#include "Segment.hpp" +#include "SegmentBuffer.hpp" + +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + class SegmentManager + { + public: + SegmentManager(GPI::Context&); + SegmentManager() = delete; + 
SegmentManager(SegmentManager const&) = delete; + SegmentManager& operator=(SegmentManager const&) = delete; + ~SegmentManager() = default; + + void create_segment(GPI::SegmentID, GPI::Group const&, std::size_t); + SegmentBuffer get_buffer_of_size(GPI::SegmentID, std::size_t); + + private: + class AllocatedSegment + { + public: + AllocatedSegment(GPI::Context& context, GPI::Group const& group, + GPI::SegmentID id, std::size_t size, std::size_t offset) + : segment(std::make_unique(context, group, id, size)), + current_offset(offset) + {} + AllocatedSegment(AllocatedSegment&&) = default; + AllocatedSegment& operator=(AllocatedSegment&&) = default; + + std::unique_ptr segment; + std::size_t current_offset; + }; + + GPI::Context& context; + std::unordered_map segments; + }; + } +} diff --git a/src/gpi_comm_lib/gpi/Types.hpp b/src/gpi_comm_lib/gpi/Types.hpp new file mode 100644 index 00000000..bd331258 --- /dev/null +++ b/src/gpi_comm_lib/gpi/Types.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +#include + +namespace tarantella +{ + namespace GPI + { + using Rank = short unsigned int; + using SegmentID = unsigned char; + using GroupID = unsigned long; + + using NotificationID = std::size_t; + using NotificationRange = std::pair; + using QueueID = gaspi_queue_id_t; + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/gpi/gaspiCheckReturn.cpp b/src/gpi_comm_lib/gpi/gaspiCheckReturn.cpp new file mode 100755 index 00000000..a7a6139f --- /dev/null +++ b/src/gpi_comm_lib/gpi/gaspiCheckReturn.cpp @@ -0,0 +1,24 @@ +#include "gaspiCheckReturn.hpp" + +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + void gaspiCheckReturn(const gaspi_return_t err, + const std::string prefix) + { + if (err != GASPI_SUCCESS) + { + gaspi_string_t raw; + gaspi_print_error(err, &raw); + std::string message = prefix + std::string(raw); + free(raw); + throw std::runtime_error(message); + } + } + } +} diff --git a/src/gpi_comm_lib/gpi/gaspiCheckReturn.hpp b/src/gpi_comm_lib/gpi/gaspiCheckReturn.hpp new file mode 100755 index 00000000..4cd4cc35 --- /dev/null +++ b/src/gpi_comm_lib/gpi/gaspiCheckReturn.hpp @@ -0,0 +1,16 @@ +#pragma once + +#include + +#include +#include +#include + +namespace tarantella +{ + namespace GPI + { + void gaspiCheckReturn(const gaspi_return_t err, + const std::string prefix); + } +} \ No newline at end of file diff --git a/src/gpi_comm_lib/pybind11_wrappers.cpp b/src/gpi_comm_lib/pybind11_wrappers.cpp new file mode 100644 index 00000000..c1b85ae9 --- /dev/null +++ b/src/gpi_comm_lib/pybind11_wrappers.cpp @@ -0,0 +1,155 @@ +#include "collectives/BufferElementType.hpp" +#include "collectives/TensorInfo.hpp" +#include "distribution/GroupBuilder.hpp" +#include "distribution/SegmentIDBuilder.hpp" +#include "gpi/Context.hpp" +#include "PipelineCommunicator.hpp" +#include "SynchCommunicator.hpp" +#include "TensorBroadcaster.hpp" + +#include +#include +#include + +#include +#include + +namespace py = pybind11; + +PYBIND11_MODULE(GPICommLib, m) +{ + m.doc() = "GPI communication library for Deep Learning"; + + py::class_(m, "GPIContext") + .def(py::init<>()) + .def_property_readonly("rank", &tarantella::GPI::Context::get_rank) + .def_property_readonly("size", &tarantella::GPI::Context::get_comm_size); + + py::class_(m, "TensorInfo") + .def(py::init( + [](std::size_t tensid, std::size_t nelems, py::dtype tensdtype) + { + tarantella::collectives::BufferElementType elemtype; + if (tensdtype.is(py::dtype::of())) + { + elemtype = 
tarantella::collectives::BufferElementType::FLOAT; + } + else if (tensdtype.is(py::dtype::of())) + { + elemtype = tarantella::collectives::BufferElementType::INT32; + } + else if (tensdtype.is(py::dtype::of())) + { + elemtype = tarantella::collectives::BufferElementType::INT16; + } + else + { + throw std::runtime_error("[Pybind11][TensorInfo] Unknown buffer type"); + } + return std::unique_ptr( + new tarantella::collectives::TensorInfo(tensid, nelems, elemtype)); + })); + + py::class_(m, "SynchDistCommunicator") + .def(py::init( + [](tarantella::GPI::Context& context, + std::vector tensor_infos, + std::size_t fusion_threshold_bytes) + { + tarantella::distribution::DataParallelGroupBuilder group_builder(context); + tarantella::distribution::DataParallelSegmentIDBuilder segment_id_builder{}; + + return std::unique_ptr( + new tarantella::SynchCommunicator(context, + segment_id_builder.get_segment_id(), + group_builder.get_group(), + tensor_infos, + fusion_threshold_bytes)); + }), + // ensure the `context` object is not garbage-collected as long as the SynchCommunicator is alive + py::keep_alive<1, 2>()) + .def("get_raw_ptr", [](tarantella::SynchCommunicator& d) + { + return reinterpret_cast(&d); + }, + py::return_value_policy::reference_internal); + + py::class_(m, "TensorBroadcaster") + .def(py::init( + [](tarantella::GPI::Context& context, + std::vector tensor_infos, + tarantella::GPI::Rank root_rank) + { + tarantella::distribution::DataParallelGroupBuilder group_builder(context); + tarantella::distribution::DataParallelSegmentIDBuilder segment_id_builder{}; + + return std::unique_ptr( + new tarantella::TensorBroadcaster(context, + segment_id_builder.get_segment_id(), + group_builder.get_group(), + tensor_infos, + root_rank)); + }), + py::keep_alive<1, 2>()) + .def("broadcast", + [](tarantella::TensorBroadcaster &tb, std::vector& tensor_list) + { + std::vector tensor_ptrs; + for (auto& tens : tensor_list) + { + py::buffer_info info = tens.request(); + tensor_ptrs.emplace_back(info.ptr); + } + tb.exec_broadcast(tensor_ptrs); + }); + + py::class_(m, "Barrier") + .def(py::init( + [](tarantella::GPI::Context&) + { + return std::unique_ptr( + new tarantella::collectives::Barrier::GPIBarrierAllRanks()); + }), + py::keep_alive<1, 2>()) + .def("blocking_barrier_all_ranks", + [](tarantella::collectives::Barrier::GPIBarrierAllRanks &barrier) + { + barrier.blocking_barrier(); + }); + + py::class_(m, "PipelineCommunicator") + .def(py::init( + [](tarantella::GPI::Context& context, + std::unordered_map, std::size_t>> edges, + std::size_t num_micro_batches) + { + auto const rank = context.get_rank(); + std::unordered_map conn_infos; + tarantella::distribution::PipelineSegmentIDBuilder segment_id_builder; + + // build connection info (segment_id, other rank, buffer_size) + // for each edge connected to the current rank + for (auto const& [conn_id, edge_and_size] : edges) + { + auto const ranks = edge_and_size.first; + if (ranks.first != rank && ranks.second != rank) continue; + + auto const other_rank = (ranks.first == rank) ? 
ranks.second : ranks.first; + auto const buffer_size = edge_and_size.second; + auto const segment_id = segment_id_builder.get_segment_id(conn_id); + tarantella::ConnectionInfo const conn_info(segment_id, other_rank, buffer_size); + conn_infos.emplace(conn_id, conn_info); + } + + return std::make_unique(context, conn_infos, num_micro_batches); + }), + // ensure the `context` object is not garbage-collected as long as the PipelineCommunicator is alive + py::keep_alive<1, 2>()) + .def("get_raw_ptr", [](tarantella::PipelineCommunicator& comm) + { + return reinterpret_cast(&comm); + }, + py::return_value_policy::reference_internal); +} diff --git a/src/gpi_comm_lib/tf_ops/AllreduceOps.cpp b/src/gpi_comm_lib/tf_ops/AllreduceOps.cpp new file mode 100644 index 00000000..74bf92ad --- /dev/null +++ b/src/gpi_comm_lib/tf_ops/AllreduceOps.cpp @@ -0,0 +1,125 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "SynchCommunicator.hpp" + +using namespace tensorflow; + +REGISTER_OP("StartAllreduceOp") + .Attr("tnt_synchcomm: int") + .Attr("tensor_id: int") + .Input("input_tensor: float") + .Output("out_tensor: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + return Status::OK(); + }); +REGISTER_OP("FinishAllreduceOp") + .Attr("tnt_synchcomm: int") + .Attr("tensor_id: int") + .Attr("Tout: type") + .Input("input_tensor: float") + .Output("out_tensor: Tout") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { + c->set_output(0, c->input(0)); + return Status::OK(); + }); +REGISTER_OP("BarrierOp") + .Attr("T: list(type)") + .Attr("Tout: list(type)") + .Input("in: T") + .Output("out: Tout") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) + { + for (auto i = 0; i < c->num_outputs(); ++i) + { + c->set_output(i, c->input(i)); + } + return Status::OK(); + }); + +template +class CommunicateTensorOp : public OpKernel +{ + public: + explicit CommunicateTensorOp(OpKernelConstruction* context) + : OpKernel(context) + { + tensorflow::int64 context_ptr; + OP_REQUIRES_OK(context, + context->GetAttr("tnt_synchcomm", &context_ptr)); + synch_communicator = reinterpret_cast(context_ptr); + OP_REQUIRES_OK(context, + context->GetAttr("tensor_id", &tensor_id)); + } + + void Compute(OpKernelContext* context) override + { + static_cast(*this).compute_impl(context); + } + + protected: + tensorflow::int64 tensor_id; + tarantella::SynchCommunicator *synch_communicator; +}; + +class StartAllreduceOp : public CommunicateTensorOp +{ + public: + explicit StartAllreduceOp(OpKernelConstruction* context) + : CommunicateTensorOp(context) + { } + + void compute_impl(OpKernelContext* context) + { + auto input_index = 0; + auto output_index = 0; + const Tensor &input_tensor = context->input(input_index); + auto* input_flat = input_tensor.flat().data(); + + synch_communicator->start_allreduce_impl(tensor_id, input_flat); + context->set_output(output_index, input_tensor); + } + +}; + +class FinishAllreduceOp : public CommunicateTensorOp +{ + public: + explicit FinishAllreduceOp(OpKernelConstruction* context) + : CommunicateTensorOp(context) + { } + + void compute_impl(OpKernelContext *context) + { + auto input_index = 0; + auto output_index = 0; + const Tensor &input_tensor = context->input(input_index); + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(output_index, input_tensor.shape(), + 
&output_tensor)); + auto* output_flat = output_tensor->flat().data(); + synch_communicator->finish_allreduce_impl(tensor_id, output_flat); + } +}; + +class BarrierOp : public OpKernel +{ + public: + explicit BarrierOp(OpKernelConstruction* context) + : OpKernel(context) + {} + + void Compute(OpKernelContext* context) override + { + for (auto i = 0; i < context->num_outputs(); ++i) + { + context->set_output(i, context->input(i)); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("StartAllreduceOp").Device(DEVICE_CPU), StartAllreduceOp); +REGISTER_KERNEL_BUILDER(Name("FinishAllreduceOp").Device(DEVICE_CPU), FinishAllreduceOp); +REGISTER_KERNEL_BUILDER(Name("BarrierOp").Device(DEVICE_CPU), BarrierOp); \ No newline at end of file diff --git a/src/gpi_comm_lib/tf_ops/CMakeLists.txt b/src/gpi_comm_lib/tf_ops/CMakeLists.txt new file mode 100644 index 00000000..4f2c5a74 --- /dev/null +++ b/src/gpi_comm_lib/tf_ops/CMakeLists.txt @@ -0,0 +1,26 @@ + +set(TFOPS_SOURCES + ${SRC_DIR}/gpi_comm_lib/tf_ops/AllreduceOps.cpp + ${SRC_DIR}/gpi_comm_lib/tf_ops/SendRecvOps.cpp +) + +set(TFOPS_BUILD_DIR ${CMAKE_BINARY_DIR}/tnt_tfops) +set(TFOPS_LOADER_DIR + ${SRC_DIR}/gpi_comm_lib/tf_ops/tnt_tfops) + +add_custom_target(tfops-loader ALL + COMMAND ${CMAKE_COMMAND} -E copy_directory ${TFOPS_LOADER_DIR} ${TFOPS_BUILD_DIR}) + +extended_add_library(NAME tfops + NAMESPACE tnt + TYPE SHARED + SOURCES + ${TFOPS_SOURCES} + LIBRARIES + tnt::gpicommlib + Tensorflow::Tensorflow + INSTALL + INSTALL_DESTINATION + ${INSTALL_LIB_DIR} + POSITION_INDEPENDENT) +set_property(TARGET tnt-tfops PROPERTY CXX_STANDARD 14) diff --git a/src/gpi_comm_lib/tf_ops/SendRecvOps.cpp b/src/gpi_comm_lib/tf_ops/SendRecvOps.cpp new file mode 100644 index 00000000..ec5a5c1c --- /dev/null +++ b/src/gpi_comm_lib/tf_ops/SendRecvOps.cpp @@ -0,0 +1,103 @@ +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" + +#include "PipelineCommunicator.hpp" + +using namespace tensorflow; + +REGISTER_OP("SendOp") + .Attr("tnt_pipeline_comm: int") + .Input("input_tensor: float") + .Input("connection_id: int32") + .Input("micro_batch_id: int32") + .Output("out_tensor: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) + { + c->set_output(0, c->input(0)); + return Status::OK(); + }); +REGISTER_OP("RecvOp") + .Attr("tnt_pipeline_comm: int") + .Input("input_tensor: float") + .Input("connection_id: int32") + .Input("micro_batch_id: int32") + .Output("out_tensor: float") + .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) + { + c->set_output(0, c->input(0)); + return Status::OK(); + }); + +class SendOp : public OpKernel +{ + public: + explicit SendOp(OpKernelConstruction* context) + : OpKernel(context) + { + tensorflow::int64 context_ptr; + OP_REQUIRES_OK(context, context->GetAttr("tnt_pipeline_comm", &context_ptr)); + pipeline_communicator = reinterpret_cast(context_ptr); + } + + void Compute(OpKernelContext* context) override + { + const Tensor& input_tensor = context->input(0); + const Tensor& conn_id_tensor = context->input(1); + const Tensor& micro_batch_id_tensor = context->input(2); + + auto send_buf = reinterpret_cast(const_cast(input_tensor.flat().data())); + auto const conn_id = static_cast( + conn_id_tensor.flat().data()[0]); + auto const micro_batch_id = static_cast( + micro_batch_id_tensor.flat().data()[0]); + + // allocate (fake) output + auto const output_index = 0; + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, 
context->allocate_output(output_index, input_tensor.shape(), &output_tensor)); + + pipeline_communicator->non_blocking_send(send_buf, conn_id, micro_batch_id); + } + + private: + tarantella::PipelineCommunicator *pipeline_communicator; +}; + +class RecvOp : public OpKernel +{ + public: + explicit RecvOp(OpKernelConstruction* context) + : OpKernel(context) + { + tensorflow::int64 context_ptr; + OP_REQUIRES_OK(context, context->GetAttr("tnt_pipeline_comm", &context_ptr)); + pipeline_communicator = reinterpret_cast(context_ptr); + } + + void Compute(OpKernelContext* context) override + { + const Tensor& input_tensor = context->input(0); + const Tensor& conn_id_tensor = context->input(1); + const Tensor& micro_batch_id_tensor = context->input(2); + + auto const conn_id = static_cast( + conn_id_tensor.flat().data()[0]); + auto const micro_batch_id = static_cast( + micro_batch_id_tensor.flat().data()[0]); + + // allocate output + auto const output_index = 0; + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(output_index, input_tensor.shape(), &output_tensor)); + + auto* recv_buf = output_tensor->flat().data(); + pipeline_communicator->blocking_recv(recv_buf, conn_id, micro_batch_id); + } + + private: + tarantella::PipelineCommunicator *pipeline_communicator; +}; + +REGISTER_KERNEL_BUILDER(Name("SendOp").Device(DEVICE_CPU), SendOp); +REGISTER_KERNEL_BUILDER(Name("RecvOp").Device(DEVICE_CPU), RecvOp); \ No newline at end of file diff --git a/src/gpi_comm_lib/tf_ops/tnt_tfops/__init__.py b/src/gpi_comm_lib/tf_ops/tnt_tfops/__init__.py new file mode 100644 index 00000000..4ea68b87 --- /dev/null +++ b/src/gpi_comm_lib/tf_ops/tnt_tfops/__init__.py @@ -0,0 +1,6 @@ +import tensorflow as tf +import pathlib +import os + +tnt_ops = tf.load_op_library('libtnt-tfops.so') + diff --git a/src/runtime/__init__.py b/src/runtime/__init__.py new file mode 100644 index 00000000..ffbcbf98 --- /dev/null +++ b/src/runtime/__init__.py @@ -0,0 +1,2 @@ +import logging +logger = logging.getLogger("TNT_CLI") diff --git a/src/runtime/environment_config.py b/src/runtime/environment_config.py new file mode 100644 index 00000000..d66b2edc --- /dev/null +++ b/src/runtime/environment_config.py @@ -0,0 +1,51 @@ +import os +import sys + +import runtime.tnt_config as tnt_config +from runtime.tnt_config import TNTConfig + +def get_tnt_variables_from_args(args): + tnt_vars = {TNTConfig.TNT_LOG_LEVEL.name : args.log_level, + TNTConfig.TNT_LOG_ON_ALL_DEVICES.name : str(args.log_all), + TNTConfig.TNT_OUTPUT_ON_ALL_DEVICES.name : str(args.output_all)} + + if args.fusion_threshold_kb is not None: + tnt_vars[TNTConfig.TNT_FUSION_THRESHOLD.name] = int(args.fusion_threshold_kb) * 1024 + return tnt_vars + +def get_tnt_gpus(gpus_per_node): + return {TNTConfig.TNT_GPUS_PER_NODE.name : gpus_per_node} + +def update_environment_paths(libraries_path): + os.environ["PYTHONPATH"]=os.pathsep.join(sys.path) + + for var_name in ["LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH"]: + os.environ[var_name] = os.pathsep.join([libraries_path, + os.environ.get(var_name, "")]) + +def collect_environment_variables(): + env = {} + for var in ['PATH', 'PYTHONPATH', 'LD_LIBRARY_PATH', 'DYLD_LIBRARY_PATH']: + if var in os.environ: + env[var] = os.environ[var] + return env + +def collect_tensorflow_variables(): + env = {} + for var, value in os.environ.items(): + if var.lower().startswith("tf_"): + env[var] = value + return env + +def collect_tarantella_variables(): + env = {} + for var, value in os.environ.items(): + if 
var.startswith(tnt_config.TARANTELLA_ENV_VAR_PREFIX): + env[var] = value + return env + +def gen_exports_from_dict(env_dict): + environment = "" + for var_name,value in env_dict.items(): + environment += "export {}={}\n".format(var_name, value) + return environment diff --git a/src/runtime/file_management.py b/src/runtime/file_management.py new file mode 100644 index 00000000..85eed66b --- /dev/null +++ b/src/runtime/file_management.py @@ -0,0 +1,61 @@ +from abc import ABCMeta, abstractmethod +import os +import stat +import tempfile + +def make_executable(filename): + os.chmod(filename, os.stat(filename).st_mode | stat.S_IXUSR) + +class TemporaryFileWrapper(metaclass = ABCMeta): + def __init__(self, dir = None, is_executable = False): + self.is_executable = is_executable + self.dir = dir + + def __enter__(self): + self.file_handle, self.filename = tempfile.mkstemp(dir = self.dir) + + with os.fdopen(self.file_handle, 'w') as f: + contents = self.get_initial_contents() + f.write(str(contents)) + + if self.is_executable: + make_executable(self.filename) + + def __exit__(self, *args): + os.remove(self.filename) + + @abstractmethod + def get_initial_contents(self): + raise NotImplementedError + + @property + def name(self): + return self.filename + + +class HostFile(TemporaryFileWrapper): + def __init__(self, nodes, devices_per_node): + super().__init__(is_executable = False) + + if not isinstance(nodes, list) or len(nodes) == 0: + raise LogicError("[create_nodes_file] Empty list of nodes provided") + if devices_per_node is None or devices_per_node <= 0: + raise LogicError("[create_nodes_file] Incorrect number of `devices_per_node`") + self.nodes = sorted(nodes) + self.devices_per_node = devices_per_node + + def get_initial_contents(self): + contents = "" + for node in self.nodes: + contents += '\n'.join([node] * self.devices_per_node) + '\n' + return contents + +class GPIScriptFile(TemporaryFileWrapper): + def __init__(self, header, environment, command, dir): + super().__init__(dir = dir, is_executable = True) + self.contents = [header, + environment, + command] + + def get_initial_contents(self): + return '\n'.join(self.contents) diff --git a/src/runtime/logging_config.py b/src/runtime/logging_config.py new file mode 100644 index 00000000..a86e3078 --- /dev/null +++ b/src/runtime/logging_config.py @@ -0,0 +1,40 @@ +import logging +import os + +from runtime import tf_config + +def setup_logging(logger, log_level, rank = 0, is_master_rank = True, + log_on_all_devices = False): + do_logging = log_on_all_devices + if not do_logging: + do_logging = is_master_rank + + if log_on_all_devices: + tnt_formatter_prefix = '[%(name)s] %(levelname)s: [rank %(rank)d] ' + else: + tnt_formatter_prefix = '[%(name)s] %(levelname)s: ' + + formatter = logging.Formatter(tnt_formatter_prefix + '%(pathname)s:%(lineno)d: %(message)s') + logger.setLevel(log_level) + logger.addFilter(TntLoggingFilter(rank, do_logging)) + + handler = logging.StreamHandler() + handler.setLevel(logger.level) + handler.setFormatter(formatter) + logger.addHandler(handler) + + tf_config.setup_logging(log_level = log_level, + formatter_prefix = tnt_formatter_prefix, + logging_filter = TntLoggingFilter(rank, do_logging)) + +class TntLoggingFilter(logging.Filter): + def __init__(self, rank, do_logging): + super().__init__() + self.rank = rank + self.do_logging = do_logging + + def filter(self, record): + record.rank = self.rank + if not self.do_logging: + return False + return True diff --git a/src/runtime/platform_config.py 
b/src/runtime/platform_config.py new file mode 100644 index 00000000..fdb7fa64 --- /dev/null +++ b/src/runtime/platform_config.py @@ -0,0 +1,63 @@ +import platform +import os + +import runtime.tf_config as tf_config +from runtime import logger + +def generate_nodes_list(hostfile = None): + if hostfile is None: + hostname = platform.node() + logger.debug("No `hostfile` provided. Using only the current node `{}`".format(hostname)) + return [hostname] + + if not os.path.isfile(hostfile): + raise ValueError("Incorrect `hostfile` provided with path `{}`".format(hostfile)) + + nodes_list = [] + try: + with open(hostfile, 'r') as f: + nodes_list = f.readlines() + except: + raise ValueError("Cannot read from `hostfile` with path `{}`".format(hostfile)) + + if len(nodes_list) == 0: + raise ValueError("Empty `hostfile` with path `{}`".format(hostfile)) + + unique_nodes = [node.strip() for node in set(nodes_list)] + if len(nodes_list) != len(set(nodes_list)): + logger.debug("The `hostfile` does not contain only unique hostnames; removing duplicates.") + return unique_nodes + + +def generate_num_gpus_per_node(npernode = None): + num_physical_gpus = len(tf_config.get_available_gpus()) + logger.debug("Num GPUs Available: {}".format(num_physical_gpus)) + + if npernode is None: # use as many GPUs as possible + num_devices = num_physical_gpus + + else: # the user requested a specific number of devices + if num_physical_gpus < npernode: + logger.debug("Not enough GPUs for the requested {} devices per node".format(npernode)) + num_devices = 0 + else: + num_devices = npernode + return num_devices + +def generate_num_devices_per_node(npernode = None, use_gpus = True): + num_gpus = 0 + if use_gpus: + num_gpus = generate_num_gpus_per_node(npernode) + + num_cpus = 0 + if num_gpus == 0: + if npernode is None: # use one rank per node + num_cpus = 1 + else: + num_cpus = npernode + + if use_gpus and num_gpus <= 0: + logger.warn("Cannot find {0} available GPUs per node as \ +requested; using {0} ranks on CPUs instead".format(num_cpus)) + + return num_gpus, num_cpus \ No newline at end of file diff --git a/src/runtime/tf_config.py b/src/runtime/tf_config.py new file mode 100644 index 00000000..224152c4 --- /dev/null +++ b/src/runtime/tf_config.py @@ -0,0 +1,32 @@ +import logging +import os +import tensorflow as tf + +from runtime import environment_config + +def get_available_gpus(): + """ Checks whether there are GPUs available on the machine and assigns one + to the current rank. 
+ """ + phys_gpus = tf.config.experimental.list_physical_devices('GPU') + if phys_gpus is None: + phys_gpus = [] + return phys_gpus + + +_tf_logging_defaults = {'TF_CPP_MIN_LOG_LEVEL' : '3', + } + +def setup_logging(log_level, formatter_prefix, logging_filter): + tf_env = environment_config.collect_tensorflow_variables() + for var,value in _tf_logging_defaults.items(): + if not var in tf_env: + os.environ[var] = value + + tf_logger = tf.get_logger() + tf_logger.addFilter(logging_filter) + for h in tf_logger.handlers: + tf_logger_format = h.formatter._fmt.replace('%(levelname)s:', '').replace('%(name)s:','') + tf_logger_format = formatter_prefix + tf_logger_format + formatter = logging.Formatter(tf_logger_format) + h.setFormatter(formatter) diff --git a/src/runtime/tnt_config.py b/src/runtime/tnt_config.py new file mode 100644 index 00000000..4583c8e7 --- /dev/null +++ b/src/runtime/tnt_config.py @@ -0,0 +1,82 @@ +import enum +import os + +TARANTELLA_ENV_VAR_PREFIX = "TNT_" + +class TNTConfig(enum.Enum): + TNT_GPUS_PER_NODE = 'TNT_GPUS_PER_NODE' + TNT_OUTPUT_ON_ALL_DEVICES = 'TNT_OUTPUT_ON_ALL_DEVICES' + TNT_LOG_ON_ALL_DEVICES = 'TNT_LOG_ON_ALL_DEVICES' + TNT_TENSORBOARD_ON_ALL_DEVICES = 'TNT_TENSORBOARD_ON_ALL_DEVICES' + TNT_LOG_DIR = 'TNT_LOG_DIR' + TNT_LOG_LEVEL = 'TNT_LOG_LEVEL' + TNT_FUSION_THRESHOLD = 'TNT_FUSION_THRESHOLD' + +class TarantellaConfigurationDefaults: + @classmethod + def config(self): + default_config = { TNTConfig.TNT_GPUS_PER_NODE : None, + TNTConfig.TNT_FUSION_THRESHOLD : 32 * 1024, + TNTConfig.TNT_OUTPUT_ON_ALL_DEVICES : 'False', + TNTConfig.TNT_LOG_ON_ALL_DEVICES : 'False', + TNTConfig.TNT_TENSORBOARD_ON_ALL_DEVICES : "False", + TNTConfig.TNT_LOG_DIR : None, + TNTConfig.TNT_LOG_LEVEL : "WARN", + } + return default_config + +def get_configuration_from_env(filter_prefix = None): + config = dict() + for key in os.environ: + if filter_prefix is None: + config[key] = os.environ[key] + else: + if key.startswith(filter_prefix): + config[key] = os.environ[key] + return config + +class TarantellaConfiguration: + def __init__(self): + self.tarantella_env_prefix = TARANTELLA_ENV_VAR_PREFIX + self.config = get_configuration_from_env(self.tarantella_env_prefix) + + def get_variable_or_default(self, variable_name): + env_var_name = TNTConfig(variable_name).name + value = self.config.get(env_var_name) + if value is None: + value = TarantellaConfigurationDefaults.config().get(variable_name) + return value + + @property + def gpus_per_node(self): + gpus_per_node_string = self.get_variable_or_default(TNTConfig.TNT_GPUS_PER_NODE) + if gpus_per_node_string is None: + return None + return int(gpus_per_node_string) + + @property + def output_on_all_devices(self): + value_string = self.get_variable_or_default(TNTConfig.TNT_OUTPUT_ON_ALL_DEVICES) + return value_string.lower() == 'true' + + @property + def log_on_all_devices(self): + value_string = self.get_variable_or_default(TNTConfig.TNT_LOG_ON_ALL_DEVICES) + return value_string.lower() == 'true' + + @property + def tensorboard_on_all_devices(self): + value_string = self.get_variable_or_default(TNTConfig.TNT_TENSORBOARD_ON_ALL_DEVICES) + return value_string.lower() == "true" + + @property + def log_dir(self): + return self.get_variable_or_default(TNTConfig.TNT_LOG_DIR) + + @property + def log_level(self): + return self.get_variable_or_default(TNTConfig.TNT_LOG_LEVEL) + + @property + def fusion_threshold(self): + return int(self.get_variable_or_default(TNTConfig.TNT_FUSION_THRESHOLD)) diff --git a/src/tarantella/__init__.py 
b/src/tarantella/__init__.py new file mode 100644 index 00000000..c79632fa --- /dev/null +++ b/src/tarantella/__init__.py @@ -0,0 +1,196 @@ +import numpy as np +import tensorflow as tf +import GPICommLib + +import runtime.tnt_config as tnt_config +global_context = None +global_tnt_config = tnt_config.TarantellaConfiguration() + +import logging +logger = logging.getLogger("TNT_LIB") + +import runtime.logging_config as logging_config +import runtime.tf_config as tf_config +from tarantella.model import Model +import tarantella.optimizers as optimizers +import tarantella.optimizers.synchronous_distributed_optimizer as distributed_optimizers +from tnt_tfops import tnt_ops +from tarantella import models + +import sys + +def setup_gpus(rank, ngpus = None): + """Checks whether there are GPUs available on the machine and assigns one + to the current rank. + + To make sure a specific GPU will be used by the current rank, TensorFlow is + configured so that this particular GPU is the only one visible. + A GPU is selected if its index within the list of available GPUs is equal to + (rank % ngpus). + This allocation assumes that all nodes are homogeneous and are configured with + the same number of processes (< ngpus). + + Args: + rank: int, rank of the current process + + ngpus: int value specifying the maximum number of GPUs per node that will + be used. + """ + if ngpus is None or ngpus <= 0: + # Disable all GPUs + tf.config.experimental.set_visible_devices([], 'GPU') + visible_gpus = tf.config.experimental.get_visible_devices('GPU') + if visible_gpus and len(visible_gpus) > 0: + sys.exit("ERROR: [rank {}] Could not disable GPUs: {} GPUs still visible".format( + rank, len(visible_gpus))) + else: # try to use `ngpus` per node + phys_gpus = tf_config.get_available_gpus() + if phys_gpus and len(phys_gpus) > 0: + target_gpu = rank % ngpus + if len(phys_gpus) < ngpus: + sys.exit("ERROR: rank {} cannot use GPU_id={} (only {} GPUs available)".format( + rank, target_gpu, len(phys_gpus))) + + try: + # memory growth has to be set only once on all availble GPUs + if target_gpu == 0: + for gpu in phys_gpus: + tf.config.experimental.set_memory_growth(gpu, True) + # make sure only one GPU is visible per process + tf.config.experimental.set_visible_devices(phys_gpus[target_gpu], 'GPU') + except RuntimeError: + raise RuntimeError("[Tarantella][init] Cannot configure GPUs") + logger.debug("Using device: {}".format(tf.config.experimental.get_visible_devices())) + +def init(devices_per_node = None): + global global_context + if global_context is None: + global_context = GPICommLib.GPIContext() + + logging_config.setup_logging(logger, global_tnt_config.log_level, + get_rank(), is_master_rank(), + global_tnt_config.log_on_all_devices) + + # configure GPUs if a number of GPUs per node is specified, either as a parameter + # or as a `TNT_GPUS_PER_NODE` environment variable + if devices_per_node is None: + devices_per_node = global_tnt_config.gpus_per_node + setup_gpus(global_context.rank, ngpus = devices_per_node) + +def get_rank(): + return global_context.rank + +def get_master_rank(): + return 0 + +def is_master_rank(): + return get_rank() == get_master_rank() + +def get_size(): + return global_context.size + +def get_tensor_info(tensor_id, tensor): + return GPICommLib.TensorInfo(tensor_id, + int(np.prod(tensor.shape)), + np.dtype(tf.dtypes.as_dtype(tensor.dtype).as_numpy_dtype())) + +class TensorBroadcaster(): + def __init__(self, tensor_list, root_rank): + self.context = global_context + self.root_rank = root_rank + + 
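+    # Build one TensorInfo descriptor (tensor ID, element count, numpy dtype) per
+    # tensor; these are handed to the C++-side GPICommLib.TensorBroadcaster below.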
tensor_infos = [get_tensor_info(tid, tensor) for tid, tensor in enumerate(tensor_list)] + self.broadcaster = GPICommLib.TensorBroadcaster(self.context, + tensor_infos, + self.root_rank) + + def broadcast(self, tensor_list): + self.broadcaster.broadcast(tensor_list) + +class Barrier(): + def __init__(self): + self.barrier = GPICommLib.Barrier(global_context) + + def synchronize(self): + self.barrier.blocking_barrier_all_ranks() + +class SynchCommunicator(): + def __init__(self, global_context): + self.context = global_context + self.weight_to_index = dict() + self.comm = None + self.threshold = global_tnt_config.fusion_threshold + + def setup_infrastructure(self, gradients_and_weights): + """ Setup state and allocate GPI segments + """ + # Define gradient IDs associated with each weight, indexed by the weights' names + # Assumption: the order in which the weights are provided is deterministic + # (based on the internal TF graph description), so that all ranks process the + # weights in the same order + running_grad_id = 0 + for grad, weight in gradients_and_weights: + self.weight_to_index[weight.name] = running_grad_id + running_grad_id += 1 + + # initialize the internal `SynchCommunicator` corresponding to the provided list of gradients + grad_infos = list() + for grad, weight in gradients_and_weights: + grad_infos.append(get_tensor_info(self.weight_to_index[weight.name], grad)) + self.comm = GPICommLib.SynchDistCommunicator(global_context, grad_infos, self.threshold) + + def reduce_gradients(self, gradients_and_weights): + gradients_to_reduce = list() + for grad, weight in gradients_and_weights: + # add an Allreduce operation for each gradient + grad_id = self.weight_to_index[weight.name] + output_grad = tnt_ops.start_allreduce_op(grad, tensor_id = grad_id, + tnt_synchcomm = self.comm.get_raw_ptr()) + gradients_to_reduce.append(output_grad) + + # Create barrier op in the Tensorflow graph to make sure all + # the Allreduce operations on gradients have started. + # This ensures that the graph execution does not get delayed by waiting + # for gradients to be reduced as long as there are remaining computations + # in the backward pass. + temp_gradients = tnt_ops.barrier_op(gradients_to_reduce, + Tout = [tf.float32] * len(gradients_to_reduce)) + + # Add individual ops that wait for each gradient to be reduced before updating + # the weights. + # These ops are executed only after the backward pass has been completed. 
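+    # The resulting graph per gradient is roughly:
+    #   grad -> start_allreduce_op -> barrier_op -> finish_allreduce_op -> reduced grad
+    # so that the non-blocking start ops can overlap communication with the
+    # remaining backward-pass computation.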
+ reduced_gradients = list() + for idx, (_, weight) in enumerate(gradients_and_weights): + # gradient tensors obtained after barrier are listed in the same order + # as the initial `gradients_and_weights` + gradient = temp_gradients[idx] + grad_id = self.weight_to_index[weight.name] + + output_grad = tnt_ops.finish_allreduce_op(gradient, + tensor_id = grad_id, + Tout = tf.float32, + tnt_synchcomm = self.comm.get_raw_ptr()) + reduced_gradients.append(output_grad) + return reduced_gradients + + +class PipelineCommunicator: + def __init__(self, pipeline_comm): + # TODO: initialize pipeline communicator binding + self.pipeline_comm_ptr = pipeline_comm.get_raw_ptr() + pass + + def send(self, input, connection_id, micro_batch_id): + return tnt_ops.send_op(input, + connection_id = connection_id, + micro_batch_id = micro_batch_id, + tnt_pipeline_comm = self.pipeline_comm_ptr) + + + def recv(self, input, connection_id, micro_batch_id, output_shape): + return tnt_ops.recv_op(input, + connection_id = connection_id, + micro_batch_id = micro_batch_id, + tnt_pipeline_comm = self.pipeline_comm_ptr, + output_shape = output_shape) + diff --git a/src/tarantella/datasets/dataset_helpers.py b/src/tarantella/datasets/dataset_helpers.py new file mode 100644 index 00000000..f245a3fa --- /dev/null +++ b/src/tarantella/datasets/dataset_helpers.py @@ -0,0 +1,259 @@ +import copy + +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import gen_dataset_ops + +from tarantella import logger +import tarantella.datasets.ops as tnt_ops + +def _get_transformation_info_batch(dataset): + kwargs = {"batch_size": dataset._batch_size, + "drop_remainder": dataset._drop_remainder} + return (ds.BatchDataset, kwargs) + +def _get_transformation_info_cache(dataset): + kwargs = {"filename": dataset._filename} + return (ds.CacheDataset, kwargs) + +def _get_transformation_info_concatenate(dataset): + kwargs = {"dataset_to_concatenate": dataset._dataset_to_concatenate} + return (ds.ConcatenateDataset, kwargs) + +def _get_transformation_info_filter(dataset): + kwargs = {"predicate": dataset._predicate} + return (tnt_ops.TntFilterDataset, kwargs) + +def _get_transformation_info_flatmap(dataset): + kwargs = {"map_func": dataset._map_func} + return (tnt_ops.TntFlatMapDataset, kwargs) + +def _get_transformation_info_interleave(dataset): + kwargs = {"map_func": dataset._map_func, + "cycle_length": dataset._cycle_length, + "block_length": dataset._block_length} + return (tnt_ops.TntInterleaveDataset, kwargs) + +def _get_transformation_info_map(dataset): + kwargs = {"map_func": dataset._map_func, + "use_inter_op_parallelism": dataset._use_inter_op_parallelism, + "preserve_cardinality": dataset._preserve_cardinality} + return (tnt_ops.TntMapDataset, kwargs) + +def _get_transformation_info_paddedbatch(dataset): + kwargs = {"batch_size": dataset._batch_size, + "padded_shapes": dataset._padded_shapes, + "padding_values": dataset._padding_values, + "drop_remainder": dataset._drop_remainder} + return (tnt_ops.TntPaddedBatchDataset, kwargs) + +def _get_transformation_info_parallelinterleave(dataset): + # bug in TF2.2: `deterministic` is not saved as an attribute + deterministic = "default" + if hasattr(dataset, '_deterministic'): + deterministic = dataset._deterministic + + kwargs = {"map_func": dataset._map_func, + "cycle_length": dataset._cycle_length, + "block_length": dataset._block_length, + 
"num_parallel_calls": dataset._num_parallel_calls, + "deterministic": deterministic} + + # support for TF2.0 - does not have `buffer_output_elements` + if hasattr(dataset, '_buffer_output_elements'): + kwargs['buffer_output_elements'] = dataset._buffer_output_elements + + if hasattr(dataset, '_prefetch_input_elements'): + kwargs['prefetch_input_elements'] = dataset._prefetch_input_elements + + return (tnt_ops.TntParallelInterleaveDataset, kwargs) + +def _get_transformation_info_parallelmap(dataset): + deterministic = "default" + if hasattr(dataset, '_deterministic'): + deterministic = dataset._deterministic + + kwargs = {"map_func": dataset._map_func, + "use_inter_op_parallelism": dataset._use_inter_op_parallelism, + "num_parallel_calls": dataset._num_parallel_calls, + "preserve_cardinality": dataset._preserve_cardinality, + "deterministic": deterministic} + return (tnt_ops.TntParallelMapDataset, kwargs) + +def _get_transformation_info_prefetch(dataset): + buffer_size = dataset._buffer_size + # TF2.2: https://github.com/tensorflow/tensorflow/blob/v2.2.0/tensorflow/python/data/ops/dataset_ops.py#L4255 + if buffer_size == -1: + buffer_size = None + kwargs = {"buffer_size" : dataset._buffer_size} + return (ds.PrefetchDataset, kwargs) + +def _get_transformation_info_repeat(dataset): + count = dataset._count + if count == -1: + count = None + kwargs = {"count": count} + return (ds.RepeatDataset, kwargs) + +def _get_transformation_info_shard(dataset): + kwargs = {"num_shards": dataset._num_shards, + "index": dataset._index} + return (ds.ShardDataset, kwargs) + +def _get_transformation_info_shuffle(dataset): + # TF 2.0 - 2.2 + # ShuffleDataset does not save the given seed + # instead it has two seed properties defined as + # `self._seed, self._seed2 = random_seed.get_seed(seed)` + # with `get_seed` defined in `tensorflow/python/framework/random_seed.py` [TF2.2] + if dataset._seed2 == 0: + # there was no seed specified by the user + seed = None + else: + seed = dataset._seed2 + kwargs = {"buffer_size": dataset._buffer_size, + "seed": seed, + "reshuffle_each_iteration": dataset._reshuffle_each_iteration} + return (ds.ShuffleDataset, kwargs) + +def _get_transformation_info_skip(dataset): + kwargs = {"count": dataset._count} + return (ds.SkipDataset, kwargs) + +def _get_transformation_info_take(dataset): + kwargs = {"count": dataset._count} + return (ds.TakeDataset, kwargs) + +def _get_transformation_info_unbatch(dataset): + kwargs = {} + return (ds._UnbatchDataset, kwargs) + +def _get_transformation_info_window(dataset): + kwargs = {"size": dataset._size, + "shift": dataset._shift, + "stride": dataset._stride, + "drop_remainder": dataset._drop_remainder} + return (ds.WindowDataset, kwargs) + +def _get_transformation_info_withoptions(dataset): + kwargs = {"options": dataset._options} + return (ds._OptionsDataset, kwargs) + +_transformations = {ds.BatchDataset : _get_transformation_info_batch, + ds.CacheDataset : _get_transformation_info_cache, + ds.ConcatenateDataset: _get_transformation_info_concatenate, + ds.FilterDataset : _get_transformation_info_filter, + ds.FlatMapDataset : _get_transformation_info_flatmap, + ds.InterleaveDataset : _get_transformation_info_interleave, + ds.MapDataset : _get_transformation_info_map, + ds.PaddedBatchDataset : _get_transformation_info_paddedbatch, + ds.ParallelInterleaveDataset : _get_transformation_info_parallelinterleave, + ds.ParallelMapDataset : _get_transformation_info_parallelmap, + ds.PrefetchDataset : _get_transformation_info_prefetch, + ds.RepeatDataset : 
_get_transformation_info_repeat, + ds.ShardDataset : _get_transformation_info_shard, + ds.ShuffleDataset : _get_transformation_info_shuffle, + ds.SkipDataset : _get_transformation_info_skip, + ds.TakeDataset : _get_transformation_info_take, + ds._UnbatchDataset : _get_transformation_info_unbatch, + ds.WindowDataset : _get_transformation_info_window, + ds._OptionsDataset : _get_transformation_info_withoptions, + } + +def gen_dataset_transformations(dataset): + """Generate the list of transformations that has been applied to a dataset + Returns: tuple(original dataset, list of transformations) + """ + stack = [] + while (hasattr(dataset, '_input_dataset')): # Stops when the initial dataset is encountered, + # or a zipped/enumerated dataset is found + identified_transf = False + for transformation in _transformations: + if isinstance(dataset, transformation): + stack.append(_transformations[transformation](dataset)) + identified_transf = True + break + if not identified_transf: + raise RuntimeError("Unknown transformation provided: {}.".format(dataset._transformation_name)) + dataset = dataset._input_dataset + return (dataset, list(reversed(stack))) + + +class BatchingOpInfo: + def __init__(self, is_batched, last_batching_index = None, + transformation = None, params = None): + self._is_batched = is_batched + self._last_batching_index = last_batching_index + self._transformation = transformation + self.set_kwargs_properties(params) + + @property + def is_batched(self): + return self._is_batched + + @property + def batch_size(self): + return self._batch_size + + @property + def drop_remainder(self): + return self._drop_remainder + + def is_last_batching_transformation(self, index): + return index == self._last_batching_index + + def set_kwargs_properties(self, ds_kwargs): + if not self.is_batched: + return + ds_kwargs = ds_kwargs if isinstance(ds_kwargs, dict) else {} + + self._drop_remainder = None + if 'drop_remainder' in ds_kwargs: + self._drop_remainder = ds_kwargs.pop('drop_remainder') + + if not 'batch_size' in ds_kwargs: + raise KeyError("[DistributedDataset] Batch transformation defined without batch size") + self._batch_size = ds_kwargs.pop('batch_size') + self._additional_kwargs = ds_kwargs + + def apply(self, dataset, new_batch_size): + if not self.is_batched: + raise RuntimeError("[BatchingOpInfo] Cannot apply batching transformation: dataset is unbatched.") + kwargs = self._additional_kwargs + kwargs['batch_size'] = new_batch_size + kwargs['drop_remainder'] = self.drop_remainder + return self._transformation(dataset, **kwargs) + + +def get_batching_info(dataset_transformations): + last_batch_transf_index = None + for index, (transf, ds_kwargs) in enumerate(reversed(dataset_transformations)): + if transf in [ds.BatchDataset, tnt_ops.TntPaddedBatchDataset]: + last_batch_transf_index = len(dataset_transformations) - index - 1 + return BatchingOpInfo(is_batched = True, + last_batching_index = last_batch_transf_index, + transformation = transf, + params = copy.deepcopy(ds_kwargs)) + return BatchingOpInfo(is_batched = False) + + +def get_num_samples(dataset): + cardinality = tf.data.experimental.cardinality(dataset) + + if cardinality == tf.data.experimental.INFINITE_CARDINALITY: + logger.debug("Infinite dataset detected.") + return tf.data.experimental.INFINITE_CARDINALITY + + if cardinality != tf.data.experimental.UNKNOWN_CARDINALITY: + logger.debug("Dataset size is %d" % (cardinality.numpy())) + return cardinality.numpy() + + logger.debug("Unknown dataset size. 
Counting samples...") + dataset_size = 0 + for d in dataset: + dataset_size += 1 + logger.debug("Dataset size is %d" % (dataset_size)) + return dataset_size + diff --git a/src/tarantella/datasets/distributed_dataset.py b/src/tarantella/datasets/distributed_dataset.py new file mode 100644 index 00000000..00db4fce --- /dev/null +++ b/src/tarantella/datasets/distributed_dataset.py @@ -0,0 +1,114 @@ +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds + +from tarantella import logger +import tarantella.datasets.dataset_helpers as ds_helpers + +class DistributedDataset: + def __init__(self, dataset, num_ranks, rank, shuffle_seed = 42): + self.num_ranks = num_ranks + self.rank = rank + self.shuffle_seed = shuffle_seed + + self.dataset = dataset + self.base_dataset, self.dataset_transformations = \ + ds_helpers.gen_dataset_transformations(dataset) + self.batching_info = ds_helpers.get_batching_info(self.dataset_transformations) + + def distribute_dataset_across_ranks(self, user_micro_batch_size = None, is_training = True): + dataset = self.base_dataset + + # Batched datsets: + # re-apply dataset transformations identically, except for batching & shuffling + for index, (transf, ds_kwargs) in enumerate(self.dataset_transformations): + # shuffle operation + if isinstance(transf(dataset, **ds_kwargs), ds.ShuffleDataset): + dataset = self.shuffle_with_seed(dataset, ds_kwargs) + + # batch operation (i.e., `batch` or `padded_batch`) + elif self.batching_info.is_last_batching_transformation(index): + batch_size = self.batching_info.batch_size + if user_micro_batch_size: + micro_batch_size = user_micro_batch_size + if micro_batch_size * self.num_ranks != batch_size: + raise ValueError("[DistributedDataset] micro batch size ({}) is not consistent \ +with batch size ({}) on number of devices used ({}).".format(micro_batch_size, batch_size, + self.num_ranks)) + else: + micro_batch_size = self.get_microbatch_size(batch_size) + + if is_training: + dataset = self.distributed_batch(dataset, + batch_size = batch_size, + micro_batch_size = micro_batch_size) + else: + # FIXME: distribute batch for `evaluate` and `predict` + dataset = self.batching_info.apply(dataset, new_batch_size = micro_batch_size) + + # other operations + else: + dataset = transf(dataset, **ds_kwargs) + + # Unbatched datasets + if self.batching_info.is_batched == False: + if is_training == False: # outside `fit` + if user_micro_batch_size: + dataset = self.batching_info.apply(dataset, new_batch_size = micro_batch_size) + else: + dataset = self.batching_info.apply(dataset, new_batch_size = 1) + + if is_training == True: # inside `fit` + if user_micro_batch_size: + micro_batch_size = user_micro_batch_size + batch_size = micro_batch_size * self.num_ranks + dataset = self.distributed_batch(dataset, + batch_size = batch_size, + micro_batch_size = micro_batch_size) + else: + raise ValueError("[DistributedDataset] Unbatched datasets without tnt_micro_batch_size are not supported") + + return dataset + + def shuffle_with_seed(self, dataset, ds_kwargs): + if not 'seed' in ds_kwargs or ds_kwargs['seed'] is None: + logger.warn("Shuffling with fixed shuffle seed {}.".format(self.shuffle_seed)) + ds_kwargs['seed'] = self.shuffle_seed + else: + logger.debug("Shuffling with shuffle seed {}.".format(ds_kwargs['seed'])) + return dataset.shuffle(**ds_kwargs) + + def distributed_batch(self, dataset, batch_size, micro_batch_size): + if self.batching_info.drop_remainder == True: + dataset = self.batching_info.apply(dataset, 
new_batch_size = batch_size) + dataset = dataset.unbatch() + + else: # no drop remainder + num_samples = ds_helpers.get_num_samples(dataset) + if num_samples == tf.data.experimental.INFINITE_CARDINALITY: + raise ValueError("[DistributedDataset] Infinite dataset provided") + + # Total number of samples is not multiple of the batch size + if num_samples % batch_size != 0: + logger.warn("Number of samples ({}) is not a multiple of batch size.\ + Removing the last incomplete batch from the dataset.".format(num_samples)) + num_samples_multiple = (num_samples // batch_size) * batch_size + dataset = dataset.take(num_samples_multiple) + + dataset = self.batching_info.apply(dataset, new_batch_size = micro_batch_size) + dataset = dataset.shard(num_shards=self.num_ranks, index = self.rank) + + logger.info("Using batch size = {}, micro batch size = {}.".format( + batch_size, micro_batch_size)) + return dataset + + def get_microbatch_size(self, batch_size): + if batch_size is None or batch_size == 0: + raise ValueError("[DistributedDataset]Incorrectly defined batch size") + + if batch_size % self.num_ranks != 0: + raise ValueError("[DistributedDataset] Batch size ({}) is not a multiple".format(batch_size) + + "of the number of ranks {}".format(self.num_ranks)) + + logger.debug("Batch size ({}) is a multiple of the number of ranks {}.".format( + batch_size, self.num_ranks)) + return int(batch_size // self.num_ranks) diff --git a/src/tarantella/datasets/ops/__init__.py b/src/tarantella/datasets/ops/__init__.py new file mode 100644 index 00000000..96e09150 --- /dev/null +++ b/src/tarantella/datasets/ops/__init__.py @@ -0,0 +1,7 @@ +from tarantella.datasets.ops.tnt_filter import TntFilterDataset +from tarantella.datasets.ops.tnt_flatmap import TntFlatMapDataset +from tarantella.datasets.ops.tnt_interleave import TntInterleaveDataset +from tarantella.datasets.ops.tnt_map import TntMapDataset +from tarantella.datasets.ops.tnt_parallel_interleave import TntParallelInterleaveDataset +from tarantella.datasets.ops.tnt_parallel_map import TntParallelMapDataset +from tarantella.datasets.ops.tnt_padded_batch import TntPaddedBatchDataset \ No newline at end of file diff --git a/src/tarantella/datasets/ops/tnt_filter.py b/src/tarantella/datasets/ops/tnt_filter.py new file mode 100644 index 00000000..8fb7f74d --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_filter.py @@ -0,0 +1,26 @@ +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.ops import gen_dataset_ops + +class TntFilterDataset(ds.UnaryUnchangedStructureDataset): + """A `Dataset` that filters its input according to a predicate function.""" + def __init__(self, + input_dataset, + predicate, + use_legacy_function=False): + """See `Dataset.filter()` for details.""" + self._input_dataset = input_dataset + self._predicate = predicate # StructuredFunctionWrapper + + variant_tensor = gen_dataset_ops.filter_dataset( + input_dataset._variant_tensor, # pylint: disable=protected-access + other_arguments=self._predicate.function.captured_inputs, + predicate=self._predicate.function, + **self._flat_structure) + super(TntFilterDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + def _transformation_name(self): + return "Dataset.filter()" diff --git a/src/tarantella/datasets/ops/tnt_flatmap.py b/src/tarantella/datasets/ops/tnt_flatmap.py new file mode 100644 index 00000000..794045c2 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_flatmap.py @@ -0,0 +1,29 
@@ +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.ops import gen_dataset_ops + +class TntFlatMapDataset(ds.UnaryDataset): + """A `Dataset` that maps a function over the elements in its input and flattens the result.""" + def __init__(self, + input_dataset, + map_func): + """See `Dataset.flat_map()` for details.""" + self._input_dataset = input_dataset + self._map_func = map_func # StructuredFunctionWrapper + + variant_tensor = gen_dataset_ops.flat_map_dataset( + self._input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + f=self._map_func.function, + **self._flat_structure) + super(TntFlatMapDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._map_func.output_structure._element_spec + + def _transformation_name(self): + return "Dataset.flat_map()" diff --git a/src/tarantella/datasets/ops/tnt_interleave.py b/src/tarantella/datasets/ops/tnt_interleave.py new file mode 100644 index 00000000..2eb153f8 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_interleave.py @@ -0,0 +1,35 @@ +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.ops import gen_dataset_ops + +class TntInterleaveDataset(ds.UnaryDataset): + """A `Dataset` that interleaves the result of transformed inputs.""" + def __init__(self, + input_dataset, + map_func, + cycle_length, + block_length): + """See `Dataset.interleave()` for details.""" + self._input_dataset = input_dataset + self._map_func = map_func # StructuredFunctionWrapper + self._cycle_length = cycle_length + self._block_length = block_length + + variant_tensor = gen_dataset_ops.interleave_dataset( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, # pylint: disable=protected-access + self._cycle_length, + self._block_length, + f=self._map_func.function, + **self._flat_structure) + super(TntInterleaveDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._map_func.output_structure._element_spec + + def _transformation_name(self): + return "Dataset.interleave()" diff --git a/src/tarantella/datasets/ops/tnt_map.py b/src/tarantella/datasets/ops/tnt_map.py new file mode 100644 index 00000000..642dfda9 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_map.py @@ -0,0 +1,36 @@ +import tensorflow as tf +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.ops import gen_dataset_ops + +class TntMapDataset(ds.UnaryDataset): + """A `Dataset` that maps a function over the elements in its input.""" + def __init__(self, + input_dataset, + map_func, + use_inter_op_parallelism=True, + preserve_cardinality=False, + use_legacy_function=False): + """See `Dataset.map()` for details.""" + self._input_dataset = input_dataset + self._use_inter_op_parallelism = use_inter_op_parallelism + self._preserve_cardinality = preserve_cardinality + self._map_func = map_func # StructuredFunctionWrapper + + variant_tensor = gen_dataset_ops.map_dataset( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + f=self._map_func.function, + use_inter_op_parallelism=self._use_inter_op_parallelism, + preserve_cardinality=self._preserve_cardinality, + **self._flat_structure) + 
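+    # Register the variant tensor with the UnaryDataset base class, which records
+    # this dataset as a transformation applied on top of `input_dataset`.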
super(TntMapDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._map_func.output_structure + + def _transformation_name(self): + return "Dataset.map()" diff --git a/src/tarantella/datasets/ops/tnt_padded_batch.py b/src/tarantella/datasets/ops/tnt_padded_batch.py new file mode 100644 index 00000000..1149d3d6 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_padded_batch.py @@ -0,0 +1,56 @@ +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import smart_cond +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.data.util import nest +from tensorflow.python.data.util import structure +from tensorflow.python.ops import gen_dataset_ops + +class TntPaddedBatchDataset(ds.UnaryDataset): + """A `Dataset` that batches and pads contiguous elements from its input.""" + def __init__(self, + input_dataset, + batch_size, + padded_shapes, + padding_values, + drop_remainder, + ): + """See `Dataset.batch()` for details.""" + self._input_dataset = input_dataset + self._batch_size = batch_size + self._padded_shapes = padded_shapes + self._padding_values = padding_values + self._drop_remainder = drop_remainder + + def _padded_shape_to_batch_shape(s): + return tensor_shape.TensorShape([ + tensor_util.constant_value(self._batch_size) + if smart_cond.smart_constant_value(self._drop_remainder) else None + ]).concatenate(tensor_util.constant_value_as_shape(s)) + + output_shapes = nest.map_structure( + _padded_shape_to_batch_shape, self._padded_shapes) + self._structure = structure.convert_legacy_structure( + ds.get_legacy_output_types(self._input_dataset), output_shapes, + ds.get_legacy_output_classes(self._input_dataset)) + + variant_tensor = gen_dataset_ops.padded_batch_dataset_v2( + input_dataset._variant_tensor, # pylint: disable=protected-access + batch_size=self._batch_size, + padded_shapes=[ ops.convert_to_tensor(s, dtype=dtypes.int64) + for s in nest.flatten(self._padded_shapes) + ], + padding_values=nest.flatten(self._padding_values), + drop_remainder=self._drop_remainder, + output_shapes=structure.get_flat_tensor_shapes(self._structure)) + super(TntPaddedBatchDataset, self).__init__(input_dataset, variant_tensor) + + @property + def element_spec(self): + return self._structure + + def _transformation_name(self): + return "Dataset.padded_batch()" \ No newline at end of file diff --git a/src/tarantella/datasets/ops/tnt_parallel_interleave.py b/src/tarantella/datasets/ops/tnt_parallel_interleave.py new file mode 100644 index 00000000..5e53d746 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_parallel_interleave.py @@ -0,0 +1,71 @@ +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import gen_dataset_ops + +class TntParallelInterleaveDataset(ds.UnaryDataset): + """A `Dataset` that maps a function over its input and interleaves the result.""" + def __init__(self, + input_dataset, + map_func, + cycle_length, + block_length, + num_parallel_calls, + deterministic, + buffer_output_elements = None, # backward compatibility with TF2.0 + prefetch_input_elements = None): # backward compatibility with TF2.0 + """See `Dataset.interleave()` for 
details.""" + self._input_dataset = input_dataset + self._map_func = map_func # StructuredFunctionWrapper + + self._cycle_length = cycle_length + self._block_length = block_length + self._buffer_output_elements = buffer_output_elements + self._prefetch_input_elements = prefetch_input_elements + self._num_parallel_calls = num_parallel_calls + self._deterministic = deterministic + + if (buffer_output_elements and buffer_output_elements != ds.AUTOTUNE) or \ + (prefetch_input_elements and prefetch_input_elements != ds.AUTOTUNE): + variant_tensor = gen_dataset_ops.parallel_interleave_dataset_v4( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, # pylint: disable=protected-access + self._cycle_length, + self._block_length, + self._buffer_output_elements, + self._prefetch_input_elements, + self._num_parallel_calls, + f=self._map_func.function, + deterministic=deterministic, + **self._flat_structure) + elif deterministic != "default": + variant_tensor = gen_dataset_ops.parallel_interleave_dataset_v3( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, # pylint: disable=protected-access + self._cycle_length, + self._block_length, + self._num_parallel_calls, + f=self._map_func.function, + deterministic=deterministic_string, + **self._flat_structure) + else: + variant_tensor = gen_dataset_ops.parallel_interleave_dataset_v2( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, # pylint: disable=protected-access + self._cycle_length, + self._block_length, + self._num_parallel_calls, + f=self._map_func.function, + **self._flat_structure) + super(TntParallelInterleaveDataset, self).__init__( + input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._map_func.output_structure._element_spec + + def _transformation_name(self): + return "Dataset.interleave()" diff --git a/src/tarantella/datasets/ops/tnt_parallel_map.py b/src/tarantella/datasets/ops/tnt_parallel_map.py new file mode 100644 index 00000000..9a881676 --- /dev/null +++ b/src/tarantella/datasets/ops/tnt_parallel_map.py @@ -0,0 +1,56 @@ +from tensorflow.python.compat import compat +from tensorflow.python.data.ops import dataset_ops as ds +from tensorflow.python.framework import ops +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import gen_dataset_ops + +class TntParallelMapDataset(ds.UnaryDataset): + """A `Dataset` that maps a function over the elements in its input.""" + def __init__(self, + input_dataset, + map_func, + num_parallel_calls, + deterministic, + use_inter_op_parallelism, + preserve_cardinality, + use_legacy_function=False): + self._input_dataset = input_dataset + self._map_func = map_func # StructuredFunctionWrapper + self._deterministic = deterministic + self._use_inter_op_parallelism = use_inter_op_parallelism + self._preserve_cardinality = preserve_cardinality + + if not self._deterministic == "default" or compat.forward_compatible(2020, 3, 6): + self._num_parallel_calls = ops.convert_to_tensor( + num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls") + variant_tensor = gen_dataset_ops.parallel_map_dataset_v2( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + f=self._map_func.function, + num_parallel_calls=self._num_parallel_calls, + deterministic=self._deterministic, + 
use_inter_op_parallelism=self._use_inter_op_parallelism, + preserve_cardinality=self._preserve_cardinality, + **self._flat_structure) + else: + self._num_parallel_calls = ops.convert_to_tensor( + num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls") + variant_tensor = gen_dataset_ops.parallel_map_dataset( + input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + f=self._map_func.function, + num_parallel_calls=self._num_parallel_calls, + use_inter_op_parallelism=self._use_inter_op_parallelism, + preserve_cardinality=self._preserve_cardinality, + **self._flat_structure) + super(TntParallelMapDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._map_func.output_structure + + def _transformation_name(self): + return "Dataset.map()" diff --git a/src/tarantella/model.py b/src/tarantella/model.py new file mode 100644 index 00000000..fd8a4967 --- /dev/null +++ b/src/tarantella/model.py @@ -0,0 +1,392 @@ +import tensorflow as tf +from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.keras.engine import training_utils +from tensorflow.python.keras.callbacks import ModelCheckpoint + +import tarantella +import tarantella.optimizers.synchronous_distributed_optimizer as distributed_optimizers +import tarantella.datasets.distributed_dataset as ds +from tarantella import logger + +model_implemented_methods = ['model', 'rank', 'comm_size', + 'call', 'build', 'done_broadcast', 'set_weights', 'load_weights', + 'get_weights', '_broadcast_weights_if_necessary', '_broadcast_weights', + 'broadcaster', 'default_shuffle_seed', + 'orig_optimizer', 'orig_loss', 'orig_metrics', + 'orig_loss_weights', 'orig_sample_weight_mode', 'orig_weighted_metrics'] + +class Model(tf.keras.models.Model): + def __init__(self, model): + if not tarantella.global_context: + raise RuntimeError("""Cannot initialize a Model before the Tarantella library. + Please call "tarantella.init()" first. 
+ """) + self.rank = tarantella.get_rank() + self.comm_size = tarantella.get_size() + + self.model = model + self.input_shapes = None + self.done_broadcast = False + self.compiled = False + self.broadcaster = None + self.barrier = tarantella.Barrier() + + self.orig_optimizer = None + self.orig_loss = None + self.orig_metrics = None + self.orig_loss_weights = None + self.orig_sample_weight_mode = None + self.orig_weighted_metrics = None + + self.dist_optimizer = None + self.default_shuffle_seed = 42 + + # support for TF 2.0 -- 2.3 + self.tf_default_verbose = {'fit' : 1, + 'evaluate' : 1, + 'predict' : 0, + } + + def call(self, inputs): + return self.model.call(inputs) + + def build(self, input_shape): + return self.model.build(input_shape) + + def __getattr__(self, name): + if name in model_implemented_methods or \ + 'model' not in self.__dict__: + return getattr(self.__dict__, name) + return getattr(self.__dict__['model'], name) + + def __setattr__(self, name, value): + if name in model_implemented_methods or \ + 'model' not in self.__dict__: + self.__dict__[name] = value + else: + setattr(self.__dict__['model'], name, value) + + def __delattr__(self, name): + if name in model_implemented_methods or \ + 'model' not in self.__dict__: + delattr(self.__dict__, name) + delattr(self.__dict__['model'], name) + + def compile(self, + optimizer='rmsprop', + loss=None, + metrics=None, + loss_weights=None, + sample_weight_mode=None, + weighted_metrics=None, + **kwargs): + self.done_broadcast = False + self.compiled = True + + # Store original parameters to save the model later + self.orig_optimizer = optimizer + self.orig_loss = loss + self.orig_metrics = metrics + self.orig_loss_weights = loss_weights + self.orig_sample_weight_mode = sample_weight_mode + self.orig_weighted_metrics = weighted_metrics + + self.dist_optimizer = tarantella.distributed_optimizers.SynchDistributedOptimizer(self.orig_optimizer) + return self.model.compile(optimizer = self.dist_optimizer, + loss = self.orig_loss, + metrics = self.orig_metrics, + loss_weights = self.orig_loss_weights, + sample_weight_mode = self.orig_sample_weight_mode, + weighted_metrics = self.orig_weighted_metrics, + **kwargs) + + def fit(self, + x = None, + y = None, + callbacks = None, + validation_data = None, + tnt_micro_batch_size = None, + tnt_validation_micro_batch_size = None, + tnt_distribute_dataset = True, + tnt_distribute_validation_dataset = True, + **kwargs): + self._setup_for_execution('fit', x, y, callbacks, kwargs) + + if tnt_distribute_dataset: + distributed_x = ds.DistributedDataset(dataset = x, + num_ranks = self.comm_size, + rank = self.rank, + shuffle_seed = self.default_shuffle_seed) + x = distributed_x.distribute_dataset_across_ranks( + user_micro_batch_size = tnt_micro_batch_size, + is_training = True) + else: + logger.info("Automatic dataset distribution is disabled." 
+ " Make sure the dataset is sharded manually across ranks.") + + # Always switch off shuffling + kwargs["shuffle"] = False + + if validation_data: + if tnt_distribute_validation_dataset: + distributed_validation_data = ds.DistributedDataset(dataset = validation_data, + num_ranks = self.comm_size, + rank = self.rank, + shuffle_seed = self.default_shuffle_seed) + validation_data = distributed_validation_data.distribute_dataset_across_ranks( + user_micro_batch_size = tnt_validation_micro_batch_size, + is_training = False) + else: + logger.info("Automatic distribution for the validation dataset is disabled.") + + return self.model.fit(x, + validation_data = validation_data, + callbacks = callbacks, + **kwargs) + + def evaluate(self, + x = None, + y = None, + callbacks = None, + tnt_micro_batch_size = None, + tnt_distribute_dataset = True, + **kwargs): + self._setup_for_execution('evaluate', x, y, callbacks, kwargs) + + if tnt_distribute_dataset: + test_dataset = ds.DistributedDataset(dataset = x, + num_ranks = self.comm_size, + rank = self.rank, + shuffle_seed = self.default_shuffle_seed) + x = test_dataset.distribute_dataset_across_ranks( + user_micro_batch_size = tnt_micro_batch_size, + is_training = False) + else: + logger.info("Automatic dataset distribution is disabled.") + + return self.model.evaluate(x, callbacks = callbacks, **kwargs) + + def predict(self, + x = None, + callbacks = None, + tnt_micro_batch_size = None, + tnt_distribute_dataset = True, + **kwargs): + self._setup_for_execution('predict', x, None, callbacks, kwargs) + + if tnt_distribute_dataset: + test_dataset = ds.DistributedDataset(dataset = x, + num_ranks = self.comm_size, + rank = self.rank, + shuffle_seed = self.default_shuffle_seed) + x = test_dataset.distribute_dataset_across_ranks( + user_micro_batch_size = tnt_micro_batch_size, + is_training = False) + else: + logger.info("Automatic dataset distribution is disabled.") + return self.model.predict(x, callbacks = callbacks, **kwargs) + + def get_config(self): + return self.model.get_config() + + @classmethod + def from_config(cls, config): + keras_model = tf.keras.Model.from_config(config) + return cls(keras_model) + + def to_json(self, **kwargs): + return self.model.to_json(**kwargs) + + def to_yaml(self, **kwargs): + return self.model.to_yaml(**kwargs) + + def save_weights(self, filepath, tnt_save_all_devices = False, **kwargs): + if tnt_save_all_devices: + self.model.save_weights(filepath, **kwargs) + else: + if tarantella.is_master_rank(): + self.model.save_weights(filepath, **kwargs) + # make sure every rank can load the model after function exit + self.barrier.synchronize() + + def load_weights(self, filepath, **kwargs): + # loaded weights from the same source will be identical on all ranks + self.done_broadcast = True + return self.model.load_weights(filepath = filepath, **kwargs) + + def set_weights(self, weights): + self.model.set_weights(weights) + self._broadcast_weights() + self.done_broadcast = True + + def get_weights(self): + if not self.model.built: + if not self.input_shapes: + raise RuntimeError("""Cannot get weights before initialization. + Please call "tnt.Model.build()" or "tnt.Model.fit()" first. 
+ """) + self.model.build(self.input_shapes) + return self.model.get_weights() + + def save(self, filepath, tnt_save_all_devices = False, **kwargs): + if tnt_save_all_devices: + self._save(filepath, kwargs) + else: + if tarantella.is_master_rank(): + self._save(filepath, kwargs) + # make sure, every rank can load the model after function exit + self.barrier.synchronize() + + def _save(self, filepath, args_dict): + # 1. Re-compile underlying `Keras.model` w/ underlying optimizer + self.model.compile(optimizer = self.orig_optimizer, + loss = self.orig_loss, + metrics = self.orig_metrics, + loss_weights = self.orig_loss_weights, + sample_weight_mode = self.orig_sample_weight_mode, + weighted_metrics = self.orig_weighted_metrics) + + # 2. Save the model as `Keras.Model` with standard Keras optimizer + self.model.save(filepath = filepath, **args_dict) + + # 3. Re-compile the Tarantella Model + self.compile(optimizer = self.orig_optimizer, + loss = self.orig_loss, + metrics = self.orig_metrics, + loss_weights = self.orig_loss_weights, + sample_weight_mode = self.orig_sample_weight_mode, + weighted_metrics = self.orig_weighted_metrics) + + def summary(self, *args, **kwargs): + if tarantella.global_tnt_config.output_on_all_devices: + self.model.summary(*args, **kwargs) + else: + if tarantella.is_master_rank(): + self.model.summary(*args, **kwargs) + + def _setup_for_execution(self, exec_type, x, y, callbacks, args_dict): + self._assert_compile_has_been_called() + self._set_verbose_all_ranks(exec_type, args_dict) + self._validate_datasets(x, y) + self._validate_batch_size_argument(exec_type, args_dict) + self._set_input_shapes(x) + self._broadcast_weights_if_necessary() + self._preprocess_callbacks(callbacks) + + def _assert_compile_has_been_called(self): + if self.compiled == False: + raise RuntimeError("`tnt.Model` has to be compiled first " + "using `tnt.Model.compile`") + + def _set_verbose_all_ranks(self, exec_type, args_dict): + if not 'verbose' in args_dict: + args_dict['verbose'] = self.tf_default_verbose[exec_type] + if not tarantella.global_tnt_config.output_on_all_devices: + if not tarantella.is_master_rank(): + args_dict['verbose'] = 0 + + def _validate_datasets(self, x, y): + if not isinstance(x, tf.data.Dataset) or not y is None: + raise RuntimeError("tnt.Model only supports `tf.data.Dataset`", + "for `x` and `None` for y.") + + def _validate_batch_size_argument(self, exec_type, args_dict): + if 'batch_size' in args_dict: + raise KeyError("tnt.Model does not support `batch_size` argument in %s" % exec_type) + + if 'validation_batch_size' in args_dict and exec_type == 'fit': + raise KeyError("tnt.Model.fit does not support `validation_batch_size` argument") + + def _set_input_shapes(self, dataset): + if isinstance(dataset.element_spec, tf.TensorSpec): + self.input_shapes = dataset.element_spec.shape + elif isinstance(dataset.element_spec[0], tf.TensorSpec): # (input, outputs) + self.input_shapes = dataset.element_spec[0].shape + else: # ((input0, ..., input_n), outputs) + self.input_shapes = [elem_spec.shape for elem_spec in dataset.element_spec[0]] + + def _broadcast_weights_if_necessary(self): + if not self.done_broadcast: + self._broadcast_weights() + + def _broadcast_weights(self): + weights = self.get_weights() + + if not self.broadcaster: + self.broadcaster = tarantella.TensorBroadcaster(weights, + tarantella.get_master_rank()) + + self.broadcaster.broadcast(weights) + self.model.set_weights(weights) + + self.done_broadcast = True + + def _preprocess_callbacks(self, callbacks): + 
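+    # Adapt any user-provided Keras callbacks for distributed execution: ModelCheckpoint is wrapped so that a plain Keras model is saved, LearningRateScheduler output is silenced on non-master ranks, and TensorBoard either logs per rank or is kept only on the master rank.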
if callbacks is not None: + remove_tensorboard_index = None + + for index, callback in enumerate(callbacks): + if isinstance(callback, tf.keras.callbacks.ModelCheckpoint): + tnt_callback = TntModelCheckpoint(keras_model_checkpoint = callback, + underlying_optimizer = self.orig_optimizer, + distributed_optimizer = self.dist_optimizer) + callbacks[index] = tnt_callback + + elif isinstance(callback, tf.keras.callbacks.LearningRateScheduler): + if not tarantella.global_tnt_config.output_on_all_devices: + if not tarantella.is_master_rank(): + callback.verbose = 0 + + elif isinstance(callback, tf.keras.callbacks.TensorBoard): + if tarantella.global_tnt_config.tensorboard_on_all_devices: + callback.log_dir += '/rank_{}'.format(self.rank) + else: + if not tarantella.is_master_rank(): + remove_tensorboard_index = index + + if remove_tensorboard_index is not None: + del callbacks[remove_tensorboard_index] + + +class TntModelCheckpoint(tf.keras.callbacks.ModelCheckpoint): + def __init__(self, keras_model_checkpoint, underlying_optimizer, distributed_optimizer): + super(TntModelCheckpoint, self).__init__(keras_model_checkpoint.filepath) + self.underlying_optimizer = underlying_optimizer + self.distributed_optimizer = distributed_optimizer + + # set member variables from ModelCheckpoint instance + self.validation_data = keras_model_checkpoint.validation_data + self.model = keras_model_checkpoint.model + self._chief_worker_only = keras_model_checkpoint._chief_worker_only + self._supports_tf_logs = True + self.monitor = keras_model_checkpoint.monitor + self.filepath = keras_model_checkpoint.filepath + self.save_best_only = keras_model_checkpoint.save_best_only + self.save_weights_only = keras_model_checkpoint.save_weights_only + self.save_freq = keras_model_checkpoint.save_freq + self.epochs_since_last_save = keras_model_checkpoint.epochs_since_last_save + self._batches_seen_since_last_saving = keras_model_checkpoint._batches_seen_since_last_saving + self._last_batch_seen = 0 + self.load_weights_on_restart = keras_model_checkpoint.load_weights_on_restart + self.period = keras_model_checkpoint.period + self.monitor_op = keras_model_checkpoint.monitor_op + self.best = keras_model_checkpoint.best + + # only master rank should save and thus print messages + self.verbose = keras_model_checkpoint.verbose if tarantella.is_master_rank() else 0 + + def on_train_begin(self, logs=None): + # As of TF 2.3, this only uses `self.model.load_weights` + super().on_train_begin(logs) + + def on_train_batch_end(self, batch, logs=None): + # set the optimizer to the underlying to save a plain keras model + self.model.optimizer = self.underlying_optimizer + super().on_train_batch_end(batch, logs) + self.model.optimizer = self.distributed_optimizer + + def on_epoch_end(self, epoch, logs=None): + # set the optimizer to the underlying to save a plain keras model + self.model.optimizer = self.underlying_optimizer + super().on_epoch_end(epoch, logs) + self.model.optimizer = self.distributed_optimizer diff --git a/src/tarantella/models.py b/src/tarantella/models.py new file mode 100644 index 00000000..280126c2 --- /dev/null +++ b/src/tarantella/models.py @@ -0,0 +1,41 @@ +import tensorflow as tf +import tarantella as tnt +from tarantella import logger + +def save_model(model, filepath, **kwargs): + if isinstance(model, tnt.Model): + logger.info("save model from instance of tnt.Model") + elif isinstance(model, tf.keras.Model): + logger.info("save model from instance of tf.keras.Model") + else: + raise 
ValueError("[tnt.models.save_model] `model` needs to be either " + "a `tf.keras.Model` or a `tnt.Model`") + model.save(filepath, **kwargs) + +def load_model(filepath, **kwargs): + keras_model = tf.keras.models.load_model(filepath, **kwargs) + # FIXME: compile tnt.Model before returning + return tnt.Model(keras_model) + +def model_from_config(config, **kwargs): + return tnt.Model.from_config(config) + +def model_from_json(json_string, **kwargs): + keras_model = tf.keras.models.model_from_json(json_string, **kwargs) + return tnt.Model(keras_model) + +def model_from_yaml(yaml_string, **kwargs): + keras_model = tf.keras.models.model_from_yaml(yaml_string, **kwargs) + return tnt.Model(keras_model) + +def clone_model(model, **kwargs): + if isinstance(model, tnt.Model): + keras_model = tf.keras.models.clone_model(model.model, **kwargs) + logger.info("clone model from instance of tnt.Model") + elif isinstance(model, tf.keras.Model): + keras_model = tf.keras.models.clone_model(model, **kwargs) + logger.info("clone model from instance of tf.keras.Model") + else: + raise ValueError("[tnt.models.clone_model] `model` needs to be either " + "a `tf.keras.Model` or a `tnt.Model`") + return tnt.Model(keras_model) diff --git a/src/tarantella/optimizers/__init__.py b/src/tarantella/optimizers/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/tarantella/optimizers/__init__.py @@ -0,0 +1 @@ + diff --git a/src/tarantella/optimizers/optimizer_wrapper.py b/src/tarantella/optimizers/optimizer_wrapper.py new file mode 100644 index 00000000..3fb6ba79 --- /dev/null +++ b/src/tarantella/optimizers/optimizer_wrapper.py @@ -0,0 +1,35 @@ +import tensorflow as tf + +class OptimizerWrapper(tf.keras.optimizers.Optimizer): + def __init__(self, optimizer, name = None): + self.optimizer = optimizer + + # overwrite the name of the inner optimizer + if name is not None: + self._name = name + + def __getattr__(self, name): + return getattr(self.__dict__['optimizer'], name) + + def __setattr__(self, name, value): + if name == 'optimizer': + self.__dict__[name] = value + else: + setattr(self.__dict__['optimizer'], name, value) + + def __delattr__(self, name): + delattr(self.__dict__['optimizer'], name) + + # implement the missing methods by forwarding them to the inner optimizer implementations + def _resource_apply_dense(self, *args, **kwargs): + return self.optimizer._resource_apply_dense(*args, **kwargs) + + def _resource_apply_sparse(self, *args, **kwargs): + raise NotImplementedError("[OptimizerWrapper] _resource_apply_sparse: Sparse tensors not supported.") + + def _create_slots(self, *args, **kwargs): + return self.optimizer._create_slots(*args, **kwargs) + + def get_config(self): + return self.optimizer.get_config() + diff --git a/src/tarantella/optimizers/synchronous_distributed_optimizer.py b/src/tarantella/optimizers/synchronous_distributed_optimizer.py new file mode 100644 index 00000000..09c417a7 --- /dev/null +++ b/src/tarantella/optimizers/synchronous_distributed_optimizer.py @@ -0,0 +1,41 @@ +import tensorflow as tf +import numpy as np + +import tarantella +import tarantella.optimizers.optimizer_wrapper as wrapper +from tnt_tfops import tnt_ops + +class SynchDistributedOptimizer(wrapper.OptimizerWrapper): + _HAS_AGGREGATE_GRAD = True + + def __init__(self, optimizer, name = None): + self.optimizer = optimizer + if name is None: + name = "SynchDistributedOptimizer" + super(SynchDistributedOptimizer, self).__init__(optimizer, name = name) + + # add new attributes after the base object has been 
initialized + self.comm = tarantella.SynchCommunicator(tarantella.global_context) + self.initialized = False + + # customized gradient reduction method used by `keras.model.fit` + # cf. https://github.com/tensorflow/tensorflow/blob/b36436b087bd8e8701ef51718179037cccdfc26e/tensorflow/python/keras/engine/training.py#L2696 + def _aggregate_gradients(self, grads_and_vars): + grads_and_vars = list(grads_and_vars) + + # initialize the SynchCommunicator with gradient tensors + if not self.initialized: + self.comm.setup_infrastructure(grads_and_vars) + self.initialized = True + + reduced_gradients = self.comm.reduce_gradients(grads_and_vars) + return reduced_gradients + + # override gradient computation method used in TF2.0/2.1 + # to enable gradient reduction + def get_gradients(self, loss, params): + gradients_to_reduce = self.optimizer.get_gradients(loss, params) + + grads_and_vars = zip(gradients_to_reduce, params) + reduced_gradients = self._aggregate_gradients(grads_and_vars) + return reduced_gradients diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 00000000..a952e190 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,34 @@ +include (add_test_wrappers) + +set (include_dirs ${CMAKE_SOURCE_DIR}/src/gpi_comm_lib + ${CMAKE_SOURCE_DIR}/src/gpi_comm_lib/collectives + ${CMAKE_SOURCE_DIR}/src/gpi_comm_lib/collectives/lib) + +set(CLEANUP_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/cleanup.sh) +set(CLEANUP_TEST_NAME gpi_cleanup) +add_test (NAME ${CLEANUP_TEST_NAME} COMMAND sh ${CLEANUP_SCRIPT}) +set_tests_properties(${CLEANUP_TEST_NAME} PROPERTIES FIXTURES_CLEANUP ${CLEANUP_TEST_NAME}) + +add_subdirectory(${CMAKE_SOURCE_DIR}/test/collectives) +add_subdirectory(gpi) +add_subdirectory(python) + +set(localranks_list 1 2 4 5 7) +tarantella_compile_and_generate_gpi_test(NAME SynchCommunicator + LOCALRANKS_LIST "${localranks_list}" + TIMEOUT 20 + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/SynchCommunicator.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME ResourceManager + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/ResourceManager.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_test(NAME TensorFusor + TIMEOUT 20 + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/TensorFusor.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) diff --git a/test/GlobalContextFixture.hpp b/test/GlobalContextFixture.hpp new file mode 100644 index 00000000..6a8dfe94 --- /dev/null +++ b/test/GlobalContextFixture.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "gpi/Context.hpp" + +namespace tarantella +{ + class GlobalContext + { + public: + + GlobalContext() + { + instance() = this; + } + static GlobalContext*& instance() + { + static GlobalContext* s_inst = 0; + return s_inst; + } + + tarantella::GPI::Context gpi_cont; + }; +} diff --git a/test/ResourceManager.cpp b/test/ResourceManager.cpp new file mode 100644 index 00000000..d902e932 --- /dev/null +++ b/test/ResourceManager.cpp @@ -0,0 +1,58 @@ +#include "GlobalContextFixture.hpp" +#include "gpi/ResourceManager.hpp" +#include "utilities.hpp" + +#include + +namespace std +{ + std::ostream& operator<< (std::ostream& os, tarantella::GPI::ResourceManager const&) + { + return os; + } +} + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + BOOST_AUTO_TEST_SUITE(resourcemanager_unit) + BOOST_AUTO_TEST_CASE(resourcemanager_require_queue) + { + auto& context = 
GlobalContext::instance()->gpi_cont; + BOOST_REQUIRE_NO_THROW(context.get_resource_manager().get_queue_id_for_write_notify()); + } + + BOOST_AUTO_TEST_CASE(resourcemanager_require_group) + { + auto& context = GlobalContext::instance()->gpi_cont; + auto& resource_manager = context.get_resource_manager(); + BOOST_REQUIRE_NO_THROW(resource_manager.make_group(gen_group_ranks(context.get_comm_size()))); + } + + BOOST_AUTO_TEST_CASE(resourcemanager_require_notification) + { + auto& resource_manager = GlobalContext::instance()->gpi_cont.get_resource_manager(); + GPI::SegmentID segment_id = 1; + auto const num_ranks = GlobalContext::instance()->gpi_cont.get_comm_size(); + auto group_all = resource_manager.make_group(gen_group_ranks(num_ranks)); + + std::size_t segment_size = 10; + BOOST_REQUIRE_NO_THROW(resource_manager.make_segment_resources(segment_id, group_all, segment_size)); + BOOST_REQUIRE_NO_THROW(resource_manager.get_notification_range(segment_id, 2)); + } + + BOOST_AUTO_TEST_CASE(resourcemanager_require_segment_buffer) + { + auto& resource_manager = GlobalContext::instance()->gpi_cont.get_resource_manager(); + GPI::SegmentID segment_id = 2; + auto const num_ranks = GlobalContext::instance()->gpi_cont.get_comm_size(); + auto group_all = resource_manager.make_group(gen_group_ranks(num_ranks)); + + std::size_t segment_size = 10; + std::size_t buffer_size = 10; + BOOST_REQUIRE_NO_THROW(resource_manager.make_segment_resources(segment_id, group_all, segment_size)); + BOOST_REQUIRE_NO_THROW(resource_manager.get_buffer_of_size(segment_id, buffer_size)); + } + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/SynchCommunicator.cpp b/test/SynchCommunicator.cpp new file mode 100644 index 00000000..3024bc10 --- /dev/null +++ b/test/SynchCommunicator.cpp @@ -0,0 +1,241 @@ +#include "BufferElementType.hpp" +#include "distribution/GroupBuilder.hpp" +#include "distribution/SegmentIDBuilder.hpp" +#include "GlobalContextFixture.hpp" +#include "SynchCommunicator.hpp" +#include "utilities.hpp" + +#include +#include +#include + +#include +#include + +using boost::test_tools::per_element; + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + float const epsilon_f(1e-6); + + std::vector> test_cases + { + { + // test case #1 + // (tensor_id, num_elements, element_type) + {1, 8, collectives::BufferElementType::FLOAT} + }, + { + // test case #2 + {5, 4 * 1000 , collectives::BufferElementType::FLOAT} + }, + { + // test case #3 + {42, 17, collectives::BufferElementType::FLOAT}, + {11, 23, collectives::BufferElementType::FLOAT}, + }, + { + // test case #4 + {1, 8, collectives::BufferElementType::FLOAT}, + {2, 8, collectives::BufferElementType::FLOAT}, + {3, 8, collectives::BufferElementType::FLOAT}, + {4, 8, collectives::BufferElementType::FLOAT}, + {5, 8, collectives::BufferElementType::FLOAT}, + {6, 9, collectives::BufferElementType::FLOAT}, + }, + { + // test case #5 + {1, 4123, collectives::BufferElementType::FLOAT}, + {2, 5000, collectives::BufferElementType::FLOAT}, + {3, 6122, collectives::BufferElementType::FLOAT}, + {4, 17, collectives::BufferElementType::FLOAT}, + {5, 8000, collectives::BufferElementType::FLOAT}, + {6, 9145, collectives::BufferElementType::FLOAT}, + }, + }; + + std::vector thresholds_bytes + { + 0UL, + 4UL, + 64UL, + 196, + 1024UL, + }; + + class SynchCommTestData + { + public: + + SynchCommTestData(std::vector const& tensor_infos, + collectives::Allreduce::Operator::ReductionOp op, + std::size_t threshold_bytes = 0UL) + : group_builder(GlobalContext::instance()->gpi_cont), 
+ segment_id_builder(), + synch_comm(GlobalContext::instance()->gpi_cont, + segment_id_builder.get_segment_id(), + group_builder.get_group(), + tensor_infos, + threshold_bytes), + expected_output_bufs(tensor_infos.size()), + input_bufs(tensor_infos.size()), + op(op) + { + auto& context = GlobalContext::instance()->gpi_cont; + auto nranks = context.get_comm_size(); + auto rank = context.get_rank(); + + // generate data for each tensor and fill in the expected result after Allreduce + for (auto grad_idx = 0U; grad_idx < tensor_infos.size(); ++grad_idx) + { + ids.push_back(tensor_infos.at(grad_idx).get_id()); + + // create input buffers for Allreduce based on the buffer size specified in the test case + std::generate_n(std::back_inserter(input_bufs[grad_idx]), + tensor_infos.at(grad_idx).get_nelems(), + [&]() + { + // fill buffer with values based on element index and current rank + auto idx = input_bufs[grad_idx].size(); + return idx * (rank + 1); + }); + + // create expected result buffers and fill them according to the tested Allreduce operation + std::generate_n(std::back_inserter(expected_output_bufs[grad_idx]), + tensor_infos.at(grad_idx).get_nelems(), + [&]() + { + auto idx = expected_output_bufs[grad_idx].size(); + auto elem = -1.f; + switch (op) + { + case collectives::Allreduce::Operator::ReductionOp::SUM: + { + elem = idx * nranks * (nranks + 1.) / 2.; + break; + } + case collectives::Allreduce::Operator::ReductionOp::AVERAGE: + { + elem = idx * (nranks + 1.) / 2.; + break; + } + default: + { + throw std::runtime_error( + "[Test][SynchCommunicator] Unknown reduction operation"); + } + } + return elem; + }); + } + }; + + int get_index_for_id(tarantella::GradID id) + { + auto it = std::find(ids.begin(), ids.end(), id); + if (it == ids.end()) + { + throw std::invalid_argument("ID not found in the list of ids for the current test case"); + } + return distance(ids.begin(), it); + } + + distribution::DataParallelGroupBuilder group_builder; + distribution::DataParallelSegmentIDBuilder segment_id_builder; + tarantella::SynchCommunicator synch_comm; + std::vector > expected_output_bufs; + std::vector > input_bufs; + std::vector ids; + collectives::Allreduce::Operator::ReductionOp const op; + }; + + BOOST_AUTO_TEST_SUITE(synch_communicator_unit) + + BOOST_DATA_TEST_CASE(synch_comm_creation, test_cases, test_case) + { + distribution::DataParallelGroupBuilder group_builder(GlobalContext::instance()->gpi_cont); + distribution::DataParallelSegmentIDBuilder segment_id_builder{}; + + BOOST_REQUIRE_NO_THROW(SynchCommunicator synch_comm(GlobalContext::instance()->gpi_cont, + segment_id_builder.get_segment_id(), + group_builder.get_group(), + test_case)); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(synch_comm_serialized_allred, test_cases * thresholds_bytes, test_case, threshold) // Cartesian product + { + auto const op = collectives::Allreduce::Operator::ReductionOp::AVERAGE; + SynchCommTestData synch_comm_data(test_case, op, threshold); + + for (auto &id : synch_comm_data.ids) + { + auto input_buf = synch_comm_data.input_bufs.at(synch_comm_data.get_index_for_id(id)); + synch_comm_data.synch_comm.start_allreduce_impl(id, input_buf.data()); + } + + for (GradID &id : synch_comm_data.ids) + { + std::vector out_data(synch_comm_data.expected_output_bufs.at(synch_comm_data.get_index_for_id(id)).size()); + synch_comm_data.synch_comm.finish_allreduce_impl(id, out_data.data()); + BOOST_TEST_REQUIRE(out_data == 
synch_comm_data.expected_output_bufs.at(synch_comm_data.get_index_for_id(id)), per_element()); + } + } + + namespace + { + void execute_iteration(SynchCommTestData& synch_comm_data) + { + std::vector> futures; + + // create multiple allreduce calls in parallel + for (auto &id : synch_comm_data.ids) + { + futures.emplace_back(std::async( + std::launch::async, + [&synch_comm_data](const GradID id) -> GradID { + auto input_buf = synch_comm_data.input_bufs.at(synch_comm_data.get_index_for_id(id)); + synch_comm_data.synch_comm.start_allreduce_impl(id, input_buf.data()); + return id; + }, + id)); + } + // wait for all allreduce operations to be submitted + for (auto &f : futures) + { + f.get(); + } + // wait for the execution of each allreduce to end and verify result + for (auto& id : synch_comm_data.ids) + { + std::vector out_data(synch_comm_data.expected_output_bufs.at(synch_comm_data.get_index_for_id(id)).size()); + synch_comm_data.synch_comm.finish_allreduce_impl(id, out_data.data()); + BOOST_TEST_REQUIRE(out_data == synch_comm_data.expected_output_bufs.at(synch_comm_data.get_index_for_id(id)), + per_element()); + } + } + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(synch_comm_parallel_allred, test_cases * thresholds_bytes, test_case, threshold) + { + auto const op = collectives::Allreduce::Operator::ReductionOp::AVERAGE; + SynchCommTestData synch_comm_data(test_case, op, threshold); + execute_iteration(synch_comm_data); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(synch_comm_repeat_parallel_allred, test_cases * thresholds_bytes, test_case, threshold) + { + auto const op = collectives::Allreduce::Operator::ReductionOp::AVERAGE; + auto nreps = 10UL; + SynchCommTestData synch_comm_data(test_case, op, threshold); + for (auto i = 0UL; i < nreps; ++i) + { + execute_iteration(synch_comm_data); + } + } + + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/TensorFusor.cpp b/test/TensorFusor.cpp new file mode 100644 index 00000000..0a96e045 --- /dev/null +++ b/test/TensorFusor.cpp @@ -0,0 +1,176 @@ +#include "FusedTensorInfo.hpp" +#include "utilities.hpp" + +#include +#include + +namespace tarantella +{ + namespace collectives + { + std::vector> non_fusion_test_cases + { + { + // test case #1 + // (tensor_id, num_elements, element_type) + {1, 8, BufferElementType::FLOAT} + }, + { + // test case #2 + {5, 4 * 1000, BufferElementType::FLOAT} + }, + { + // test case #3 + {5, 10, BufferElementType::FLOAT}, + {9, 10, BufferElementType::FLOAT} + }, + { + // test case #4 + {1, 8, BufferElementType::FLOAT}, + {2, 8, BufferElementType::FLOAT}, + {3, 8, BufferElementType::FLOAT}, + {4, 8, BufferElementType::FLOAT}, + {5, 8, BufferElementType::FLOAT}, + {6, 9, BufferElementType::FLOAT}, + }, + { + // test case #5 + {9, 3, BufferElementType::FLOAT}, + {1, 8, BufferElementType::FLOAT}, + {0, 9, BufferElementType::FLOAT}, + {4, 1, BufferElementType::FLOAT}, + {5, 17, BufferElementType::FLOAT}, + {6, 5, BufferElementType::FLOAT}, + }, + }; + + std::vector fusion_test_case + { + {0, 94, BufferElementType::FLOAT}, + {1, 17, BufferElementType::FLOAT}, + {2, 2, BufferElementType::FLOAT}, + {3, 81, BufferElementType::FLOAT}, + }; + + class GetResults + { + public: + GetResults(std::size_t threshold, std::vector const& test_case) + : id_map{}, + info_map{} + { + TensorFusor fusor {threshold}; + fusor.fuse_tensor_infos_and_ids(test_case, id_map, info_map); + } + + TensorFusor::IDMap id_map; + TensorFusor::InfoMap info_map; + }; 
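+    // Expected results when no fusion takes place (threshold 0): the fusor maps every tensor ID to itself and keeps the original TensorInfo for each tensor.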
+ + class GetZeroThresholdReferenceResults + { + public: + GetZeroThresholdReferenceResults(std::vector const& tensor_infos) + : id_map(generate_id_map(tensor_infos)), + info_map(generate_info_map(tensor_infos)) + { } + + TensorFusor::IDMap generate_id_map(std::vector const& tensor_infos) + { + TensorFusor::IDMap map {}; + for (auto const& tinfo : tensor_infos) + { + auto const id = tinfo.get_id(); + map[id] = id; + } + return map; + } + + TensorFusor::InfoMap generate_info_map(std::vector const& tensor_infos) + { + TensorFusor::InfoMap map {}; + for (auto const& tinfo : tensor_infos) + { + auto const id = tinfo.get_id(); + map[id] = tinfo; + } + return map; + } + + TensorFusor::IDMap id_map; + TensorFusor::InfoMap info_map; + }; + + BOOST_AUTO_TEST_SUITE(tensor_fusor_unit) + BOOST_DATA_TEST_CASE(tensor_fusor_with_zero_threshold, non_fusion_test_cases, test_case) + { + GetResults results {0UL, test_case}; + GetZeroThresholdReferenceResults reference {test_case}; + + BOOST_TEST_REQUIRE(results.id_map == reference.id_map); + BOOST_TEST_REQUIRE(results.info_map == reference.info_map); + } + + BOOST_AUTO_TEST_CASE(tensor_fusor_with_threshold_2_floats) + { + GetResults results {2UL*4UL, fusion_test_case}; + GetZeroThresholdReferenceResults reference {fusion_test_case}; + + BOOST_TEST_REQUIRE(results.id_map == reference.id_map); + BOOST_TEST_REQUIRE(results.info_map == reference.info_map); + } + + BOOST_AUTO_TEST_CASE(tensor_fusor_with_threshold_10_floats) + { + GetResults results {10UL*4UL, fusion_test_case}; + + BOOST_TEST_REQUIRE(results.id_map.find(0UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(1UL)->second == 1UL); + BOOST_TEST_REQUIRE(results.id_map.find(2UL)->second == 2UL); + BOOST_TEST_REQUIRE(results.id_map.find(3UL)->second == 2UL); + + BOOST_TEST_REQUIRE(results.info_map.find(0UL)->second.get_nelems() == 94UL); + BOOST_TEST_REQUIRE(results.info_map.find(1UL)->second.get_nelems() == 17UL); + BOOST_TEST_REQUIRE(results.info_map.find(2UL)->second.get_nelems() == 83UL); + } + + BOOST_AUTO_TEST_CASE(tensor_fusor_with_threshold_100_floats) + { + GetResults results {100UL*4UL, fusion_test_case}; + + BOOST_TEST_REQUIRE(results.id_map.find(0UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(1UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(2UL)->second == 2UL); + BOOST_TEST_REQUIRE(results.id_map.find(3UL)->second == 2UL); + + BOOST_TEST_REQUIRE(results.info_map.find(0UL)->second.get_nelems() == 111UL); + BOOST_TEST_REQUIRE(results.info_map.find(2UL)->second.get_nelems() == 83UL); + } + + BOOST_AUTO_TEST_CASE(tensor_fusor_with_threshold_112_floats) + { + GetResults results {112UL*4UL, fusion_test_case}; + + BOOST_TEST_REQUIRE(results.id_map.find(0UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(1UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(2UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(3UL)->second == 3UL); + + BOOST_TEST_REQUIRE(results.info_map.find(0UL)->second.get_nelems() == 113UL); + BOOST_TEST_REQUIRE(results.info_map.find(3UL)->second.get_nelems() == 81UL); + } + + BOOST_AUTO_TEST_CASE(tensor_fusor_with_threshold_200_floats) + { + GetResults results {200UL*4UL, fusion_test_case}; + + BOOST_TEST_REQUIRE(results.id_map.find(0UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(1UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(2UL)->second == 0UL); + BOOST_TEST_REQUIRE(results.id_map.find(3UL)->second == 0UL); + + BOOST_TEST_REQUIRE(results.info_map.find(0UL)->second.get_nelems() 
== 194UL); + } + BOOST_AUTO_TEST_SUITE_END() + } +} diff --git a/test/collectives/AllreduceButterfly.cpp b/test/collectives/AllreduceButterfly.cpp new file mode 100644 index 00000000..b8e6f65f --- /dev/null +++ b/test/collectives/AllreduceButterfly.cpp @@ -0,0 +1,198 @@ +#include "AllreduceTestSetupGenerator.hpp" +#include "allreduceButterfly.h" +#include "collectives/barrier/GPIBarrier.hpp" +#include "GlobalContextFixture.hpp" +#include "gpi/ResourceManager.hpp" +#include "utilities.hpp" + +#include +#include + +using boost::test_tools::per_element; + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + float const epsilon_f(1e-6); + double const epsilon_d(1e-12); + + // Test cases defining input buffers for Allreduce on a number of ranks given by + // the number of buffers in each test case + std::vector test_cases + { + { // test case #1 (nelems = 0) + {} // rank0 + }, + { // test case #2 (nelems = 1) + {1} // rank0 + }, + { // test case #3 + {2.34, 3, 4, 5, 6} // rank0 + }, + { // test case #4 (nelems > nranks, nelems%nranks == 0) + {1, 2, 3, 0.8}, // rank0 + {0.1, 0.2, 5, 6} // rank1 + }, + // { // test case #5 (nelems = 1) + // {2}, // rank0 + // {3}, // rank1 + // {4} // rank2 + // }, + { // test case #6 (nelems > nranks, nelems%nranks >0) + {1, 2, 3, 0.8}, // rank0 + {0.1, 0.2, 5, 6}, // rank1 + {0.1, 0.2, 5, 6} // rank2 + }, + { // test case #7 (nelems == nranks) + {1, 3, 4, 5}, // rank0 + {2, 6, 77, 777}, // rank1 + {3, 42, 55, 2123}, // rank2 + {4, 423, 7, 4}, // rank3 + }, + { // test case #8 (nelems > nranks, nelems%nranks >0) + {1, 3, 4, 5, 1}, // rank0 + {2, 6, 77, 777, 1}, // rank1 + {3, 42, 55, 2123, 1}, // rank2 + {4, 423, 7, 4, 1}, // rank3 + }, + // { // test case #9 (nelems < nranks) + // {1, 3, 4}, // rank0 + // {2, 6, 77}, // rank1 + // {3, 42, 55}, // rank2 + // {4, 423, 7}, // rank3 + // }, + // { // test case #10 (nelems = 1) + // {1}, // rank0 + // {2}, // rank1 + // {3}, // rank2 + // {4}, // rank3 + // }, + }; + + template + void exec_allreduce(tarantella::GPI::Context& context, TestCase const& test_case) + { + if (context.get_comm_size() < test_case.size()) + { + throw std::logic_error("Allreduce test with fewer processes than required by test case"); + } + + // allocate group for the number of ranks defined by the test case + GPI::Group const group(gen_group_ranks(test_case.size())); + + // resource configuration for the test case + gaspi_notification_id_t const first_notification_id = 42; + GPI::SegmentID data_segment_id = 1; + GPI::SegmentID comm_segment_id = 2; + auto const data_segment_size = std::max(size_t(1), test_case[0].size() * getDataTypeSize(datatype)); + auto const comm_segment_size = std::max(size_t(1), + collectives::allreduceButterfly::getNumberOfElementsSegmentCommunicate( + test_case[0].size(), test_case.size()) * getDataTypeSize(datatype)); + + // use new segment manager for each test case and release the resources at the end + GPI::SegmentManager segmentmanager(context); + + if (group.contains_rank(context.get_rank())) + { + BOOST_REQUIRE_NO_THROW(segmentmanager.create_segment(data_segment_id, group, data_segment_size)); + BOOST_REQUIRE_NO_THROW(segmentmanager.create_segment(comm_segment_id, group, comm_segment_size)); + } + else + { + BOOST_REQUIRE_THROW(segmentmanager.create_segment(data_segment_id, group, data_segment_size), + std::runtime_error); + BOOST_REQUIRE_THROW(segmentmanager.create_segment(comm_segment_id, group, comm_segment_size), + std::runtime_error); + } + + collectives::Barrier::GPIBarrierAllRanks 
barrier; + barrier.blocking_barrier(); + + if (group.contains_rank(context.get_rank())) + { + // only processes in the group execute the Allreduce + AllreduceTestSetupGenerator test(context, test_case, + data_segment_id, comm_segment_id, + first_notification_id); + collectives::allreduceButterfly allreduce(test.input_buf.size(), + test.get_elem_type(), + op, + test.data_seg_buffer, + test.comm_seg_buffer, + test.queue_handler, + group); + + test.copy_data_to_segment(allreduce.getReducePointer()); + allreduce.signal(); + + while (allreduce()); + + auto output_buf = test.copy_results_from_segment(allreduce.getReducePointer()); + BOOST_TEST_REQUIRE(output_buf == test.expected_output_buf, per_element()); + } + else // other processes should not be defined in the test case + { + BOOST_TEST_REQUIRE(context.get_rank() >= test_case.size()); + } + + // make sure all processes have finished before cleanup + barrier.blocking_barrier(); + } + + BOOST_AUTO_TEST_SUITE(allreduce_butterfly_unit) + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(simple_allreduce_float_sum, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(simple_allreduce_float_avg, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_d)); + BOOST_DATA_TEST_CASE(simple_allreduce_double_sum, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_d)); + BOOST_DATA_TEST_CASE(simple_allreduce_double_avg, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int32_sum, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int32_avg, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int16_sum, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int16_avg, test_cases, test_case) + { + exec_allreduce + (GlobalContext::instance()->gpi_cont, test_case); + } + BOOST_AUTO_TEST_SUITE_END() +} + diff --git a/test/collectives/AllreduceButterflyDoubleBuffer.cpp b/test/collectives/AllreduceButterflyDoubleBuffer.cpp new file mode 100644 index 00000000..77f98d3b --- /dev/null +++ b/test/collectives/AllreduceButterflyDoubleBuffer.cpp @@ -0,0 +1,214 @@ +#include "AllreduceTestSetupGenerator.hpp" +#include "allreduceButterflyDoubleBuffer.h" +#include "collectives/barrier/GPIBarrier.hpp" +#include "GlobalContextFixture.hpp" +#include "gpi/ResourceManager.hpp" + +#include +#include +#include + +using boost::test_tools::per_element; + +namespace tarantella +{ + namespace + { + std::vector gen_group_ranks(size_t nranks_in_group) + { + std::vector group_ranks(nranks_in_group); + std::iota(group_ranks.begin(), group_ranks.end(), 0); + return group_ranks; + } + } + + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + float const epsilon_f(1e-6); + double const epsilon_d(1e-12); + + // Test cases defining input buffers for Allreduce on a number of ranks given by + // the number of buffers in each test case + std::vector test_cases + { + 
{ // test case #1 (nelems = 0) + {} // rank0 + }, + { // test case #2 (nelems = 1) + {1} // rank0 + }, + { // test case #3 + {2.34, 3, 4, 5, 6} // rank0 + }, + { // test case #4 (nelems > nranks, nelems%nranks == 0) + {1, 2, 3, 0.8}, // rank0 + {0.1, 0.2, 5, 6} // rank1 + }, + // { // test case #5 (nelems = 1) + // {2,3}, // rank0 + // {3,4}, // rank1 + // {4,4} // rank2 + // }, + { // test case #6 (nelems > nranks, nelems%nranks >0) + {1, 2, 3, 0.8}, // rank0 + {0.1, 0.2, 5, 6}, // rank1 + {0.1, 0.2, 5, 6} // rank2 + }, + { // test case #7 (nelems == nranks) + {1, 3, 4, 5}, // rank0 + {2, 6, 77, 777}, // rank1 + {3, 42, 55, 2123}, // rank2 + {4, 423, 7, 4}, // rank3 + }, + { // test case #8 (nelems > nranks, nelems%nranks >0) + {1, 3, 4, 5, 1}, // rank0 + {2, 6, 77, 777, 1}, // rank1 + {3, 42, 55, 2123, 1}, // rank2 + {4, 423, 7, 4, 1}, // rank3 + }, + // { // test case #9 (nelems < nranks) + // {1, 3, 4}, // rank0 + // {2, 6, 77}, // rank1 + // {3, 42, 55}, // rank2 + // {4, 423, 7}, // rank3 + // }, + // { // test case #10 (nelems = 1) + // {1}, // rank0 + // {2}, // rank1 + // {3}, // rank2 + // {4}, // rank3 + // }, + }; + + template + void exec_allreduce_double_buffer(tarantella::GPI::Context& context, TestCase const& test_case) + { + if (context.get_comm_size() < test_case.size()) + { + throw std::logic_error("Allreduce test with fewer processes than required by test case"); + } + + // allocate group for the number of ranks defined by the test case + GPI::Group const group(gen_group_ranks(test_case.size())); + + // resource configuration for the test case + gaspi_notification_id_t const first_notification_id = 42; + GPI::SegmentID data_segment_id0 = 1; + GPI::SegmentID data_segment_id1 = 2; + GPI::SegmentID comm_segment_id = 3; + auto const data_segment_size = std::max(size_t(1), test_case[0].size() * getDataTypeSize(datatype)); + auto const comm_segment_size = std::max(size_t(1), + collectives::allreduceButterfly::getNumberOfElementsSegmentCommunicate( + test_case[0].size(), test_case.size()) * getDataTypeSize(datatype)); + + // use new segment manager for each test case and release the resources at the end + GPI::SegmentManager segmentmanager(context); + + if (group.contains_rank(context.get_rank())) + { + BOOST_REQUIRE_NO_THROW(segmentmanager.create_segment(data_segment_id0, group, data_segment_size)); + BOOST_REQUIRE_NO_THROW(segmentmanager.create_segment(data_segment_id1, group, data_segment_size)); + BOOST_REQUIRE_NO_THROW(segmentmanager.create_segment(comm_segment_id, group, comm_segment_size)); + } + else + { + BOOST_REQUIRE_THROW(segmentmanager.create_segment(data_segment_id0, group, data_segment_size), + std::runtime_error); + BOOST_REQUIRE_THROW(segmentmanager.create_segment(data_segment_id1, group, data_segment_size), + std::runtime_error); + BOOST_REQUIRE_THROW(segmentmanager.create_segment(comm_segment_id, group, comm_segment_size), + std::runtime_error); + } + + collectives::Barrier::GPIBarrierAllRanks barrier; + barrier.blocking_barrier(); + + if (group.contains_rank(context.get_rank())) + { + // only processes in the group execute the Allreduce + AllreduceDoubleBufferTestSetupGenerator test(context, test_case, + data_segment_id0, data_segment_id1, + comm_segment_id, + first_notification_id); + collectives::allreduceButterflyDoubleBuffer allreduce(test.input_buf.size(), + test.get_elem_type(), + op, + test.data_seg_buffer, + test.additional_data_seg_buffer, + test.comm_seg_buffer, + test.queue_handler, + group); + + 
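+        // One allreduce iteration: copy the rank-local input into the active data segment, signal the butterfly exchange, poll until it has completed, then read the result back for comparison with the expected reduction.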
test.copy_data_to_segment(allreduce.getActiveReducePointer()); + allreduce.signal(); + + while (allreduce()); + + auto output_buf = test.copy_results_from_segment(allreduce.getResultsPointer()); + BOOST_TEST_REQUIRE(output_buf == test.expected_output_buf, per_element()); + } + else // other processes should not be defined in the test case + { + BOOST_TEST_REQUIRE(context.get_rank() >= test_case.size()); + } + + // make sure all processes have finished before cleanup + barrier.blocking_barrier(); + } + + BOOST_AUTO_TEST_SUITE(allreduce_butterfly_unit) + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(simple_allreduce_float_sum, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_f)); + BOOST_DATA_TEST_CASE(simple_allreduce_float_avg, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_d)); + BOOST_DATA_TEST_CASE(simple_allreduce_double_sum, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_TEST_DECORATOR(*boost::unit_test::tolerance(epsilon_d)); + BOOST_DATA_TEST_CASE(simple_allreduce_double_avg, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int32_sum, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int32_avg, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int16_sum, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + + BOOST_DATA_TEST_CASE(simple_allreduce_int16_avg, test_cases, test_case) + { + exec_allreduce_double_buffer + (GlobalContext::instance()->gpi_cont, test_case); + } + BOOST_AUTO_TEST_SUITE_END() +} + diff --git a/test/collectives/AllreduceTestSetupGenerator.hpp b/test/collectives/AllreduceTestSetupGenerator.hpp new file mode 100644 index 00000000..4ca96edb --- /dev/null +++ b/test/collectives/AllreduceTestSetupGenerator.hpp @@ -0,0 +1,157 @@ +#pragma once + +#include "allreduceButterfly.h" +#include "gpi/Context.hpp" + +#include +#include +#include + +#include + +using TestCase = std::vector>; +namespace std +{ + std::ostream &operator<<(std::ostream &os, TestCase const &test) + { + for (auto i = 0U; i < test.size(); ++i) + { + os << "Data for rank " << i << "/" << test.size() << ": ["; + for (auto const elem : test[i]) + { + os << elem << " "; + } + os << "]" << std::endl; + } + return os; + } +} +namespace tarantella +{ + using AllreduceDataType = collectives::allreduce::dataType; + using AllreduceOp = collectives::allreduce::reductionType; + // create expected allreduce results buffers for each test case + template + class AllreduceTestSetupGenerator + { + // determine the Allreduce element types based on the datatype template parameter + using BufferType = typename std::conditional::type >::type >::type; + + public: + AllreduceTestSetupGenerator(tarantella::GPI::Context& ctx, TestCase const& data, + tarantella::GPI::SegmentID data_segment_id, + tarantella::GPI::SegmentID comm_segment_id, + gaspi_notification_id_t 
first_notification_id) + : context(ctx), + first_notification_id(first_notification_id), + group_size(data.size()), + data_seg_buffer({data_segment_id, offset, first_notification_id}), + comm_seg_buffer({comm_segment_id, offset, first_notification_id}), + input_buf(generate_rank_input_buf(data, context.get_rank())), + expected_output_buf(generate_expected_output_buf(data, op)) + {} + virtual ~AllreduceTestSetupGenerator() = default; + + void copy_data_to_segment(void* seg_ptr) + { + std::memcpy(seg_ptr, input_buf.data(), input_buf.size()*sizeof(BufferType)); + } + + std::vector copy_results_from_segment(void* seg_ptr) + { + std::vector output_buf(input_buf.size()); + std::memcpy(output_buf.data(), seg_ptr, input_buf.size()*sizeof(BufferType)); + return output_buf; + } + + AllreduceDataType get_elem_type() const {return T;}; + + GPI::Context& context; + size_t const offset = 0; + gaspi_notification_id_t const first_notification_id; + size_t group_size; + + collectives::allreduceButterfly::segmentBuffer data_seg_buffer; + collectives::allreduceButterfly::segmentBuffer comm_seg_buffer; + collectives::queues queue_handler; + std::vector input_buf; + std::vector expected_output_buf; + + private: + + std::vector generate_rank_input_buf(TestCase const& data, gaspi_rank_t const rank) + { + std::vector in_buf; + BOOST_TEST_REQUIRE(rank < group_size); + std::transform(data[rank].begin(), data[rank].end(), + std::back_inserter(in_buf), + [](auto elem) { return static_cast(elem); }); + return in_buf; + } + + std::vector generate_expected_output_buf(TestCase const& data, + AllreduceOp operation) + { + std::vector out_buf; + switch (operation) + { + case AllreduceOp::SUM: + { + out_buf = compute_sum_over_ranks(data); + break; + } + case AllreduceOp::AVERAGE: + { + out_buf = compute_sum_over_ranks(data); + std::transform(out_buf.begin(), out_buf.end(), + out_buf.begin(), + [group_size=group_size](auto elem) { return elem/static_cast(group_size);} + ); + break; + } + default: + { + throw std::runtime_error("[AllreduceTestSetupGenerator] Unknown reduction operation"); + } + } + return out_buf; + } + + std::vector compute_sum_over_ranks(TestCase const& data) + { + std::vector out_buf(data.front().size()); + for (auto const& buffer : data) + { + std::transform(buffer.begin(), buffer.end(), out_buf.begin(), + out_buf.begin(), + [](auto elem1, auto elem2) { + return static_cast(elem1) + static_cast(elem2);} + ); + } + return out_buf; + } + }; + + template + class AllreduceDoubleBufferTestSetupGenerator : public AllreduceTestSetupGenerator + { + + public: + AllreduceDoubleBufferTestSetupGenerator(tarantella::GPI::Context& ctx, TestCase const& data, + tarantella::GPI::SegmentID data_segment_id0, + tarantella::GPI::SegmentID data_segment_id1, + tarantella::GPI::SegmentID comm_segment_id, + gaspi_notification_id_t first_notification_id) + : AllreduceTestSetupGenerator(ctx, data, + data_segment_id0, comm_segment_id, + first_notification_id), + additional_data_seg_buffer({data_segment_id1, this->offset, this->first_notification_id}) + {} + virtual ~AllreduceDoubleBufferTestSetupGenerator() = default; + + collectives::allreduceButterfly::segmentBuffer additional_data_seg_buffer; + }; +} diff --git a/test/collectives/CMakeLists.txt b/test/collectives/CMakeLists.txt new file mode 100644 index 00000000..3051275f --- /dev/null +++ b/test/collectives/CMakeLists.txt @@ -0,0 +1,26 @@ + +include (add_test_wrappers) + +set (include_dirs ${CMAKE_SOURCE_DIR}/test + ${CMAKE_SOURCE_DIR}/src/gpi_comm_lib + 
${CMAKE_SOURCE_DIR}/src/gpi_comm_lib/collectives/lib) + +tarantella_compile_and_generate_test(NAME Allreduce_basic + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/allreduce_basic.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +set(localranks_list 8) +tarantella_compile_and_generate_gpi_test(NAME AllreduceButterfly + LOCALRANKS_LIST "${localranks_list}" + TIMEOUT 20 + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/AllreduceButterfly.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME AllreduceButterflyDoubleBuffer + LOCALRANKS_LIST "${localranks_list}" + TIMEOUT 20 + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/AllreduceButterflyDoubleBuffer.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) \ No newline at end of file diff --git a/test/collectives/allreduce_basic.cpp b/test/collectives/allreduce_basic.cpp new file mode 100644 index 00000000..8130add3 --- /dev/null +++ b/test/collectives/allreduce_basic.cpp @@ -0,0 +1,63 @@ + +#include "AllreduceTestSetupGenerator.hpp" +#include "GlobalContextFixture.hpp" +#include "allreduceButterfly.h" + +#include + +#include +#include + +using boost::test_tools::per_element; + +struct AllreduceBasicTestCase +{ + unsigned long nelems; + unsigned long nprocs; + unsigned long expected_size_comm_seg; + unsigned long expected_nnotifs; +}; +namespace std +{ + std::ostream& operator<<(std::ostream& os, AllreduceBasicTestCase const& test) + { + os << "Nelems=" << test.nelems << ", Nprocs=" << test.nprocs; + os << std::endl; + return os; + } +} + +namespace tarantella +{ + std::vector test_cases + { + // nelems, nprocs, expected_size_comm_buf, expected_nnotifs + { 1, 1, 0, 0}, + { 5, 1, 0, 0}, + { 1, 2, 1, 1}, + { 2, 2, 1, 1}, + { 7, 2, 4, 1}, + { 1, 4, 3, 2}, + { 4, 4, 3, 2}, + }; + + BOOST_AUTO_TEST_SUITE(allreduce_basic_unit) + + BOOST_DATA_TEST_CASE(allreduce_size_segm_comm, test_cases, test_case) + { + auto nelems_buffer = test_case.nelems; + auto nprocs = test_case.nprocs; + + auto nelems_segment_comm = collectives::allreduceButterfly::getNumberOfElementsSegmentCommunicate( + nelems_buffer, nprocs) ; + BOOST_TEST_REQUIRE(nelems_segment_comm == test_case.expected_size_comm_seg); + } + + BOOST_DATA_TEST_CASE(allreduce_nnotifications, test_cases, test_case) + { + auto nprocs = test_case.nprocs; + auto nnotifications = collectives::allreduceButterfly::getNumberOfNotifications(nprocs); + BOOST_TEST_REQUIRE(nnotifications == test_case.expected_nnotifs); + } + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/gpi/CMakeLists.txt b/test/gpi/CMakeLists.txt new file mode 100644 index 00000000..e46ceb4f --- /dev/null +++ b/test/gpi/CMakeLists.txt @@ -0,0 +1,44 @@ + +include (add_test_wrappers) +include (parse_arguments) + +set (include_dirs ${CMAKE_SOURCE_DIR}/src/gpi_comm_lib + ${CMAKE_SOURCE_DIR}/test) + +set(localranks_list 1 2 4 5 7) +tarantella_compile_and_generate_gpi_test(NAME Context + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Context.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME Group + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/Group.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME GroupManager + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/GroupManager.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + 
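+# The remaining GPI manager tests below are registered with the same localranks_list (1 2 4 5 7) as the tests above.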
+tarantella_compile_and_generate_gpi_test(NAME QueueManager + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/QueueManager.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME NotificationManager + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/NotificationManager.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + +tarantella_compile_and_generate_gpi_test(NAME SegmentManager + LOCALRANKS_LIST "${localranks_list}" + SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/SegmentManager.cpp + LIBRARIES tnt::gpicommlib + INCLUDE_DIRECTORIES ${include_dirs}) + diff --git a/test/gpi/Context.cpp b/test/gpi/Context.cpp new file mode 100644 index 00000000..26b570c3 --- /dev/null +++ b/test/gpi/Context.cpp @@ -0,0 +1,47 @@ +#include "collectives/barrier/GPIBarrier.hpp" +#include "GlobalContextFixture.hpp" +#include "gpi/gaspiCheckReturn.hpp" +#include "utilities.hpp" + +#include +#include + +#include +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + namespace + { + std::size_t get_num_allocated_segments() + { + gaspi_number_t allocated_segments_num; + tarantella::GPI::gaspiCheckReturn(gaspi_segment_num(&allocated_segments_num), + "get number of segments"); + return allocated_segments_num; + } + } + + BOOST_AUTO_TEST_CASE(gpicontext_comm_size) + { + BOOST_TEST(GlobalContext::instance()->gpi_cont.get_comm_size() > 0); + } + + BOOST_AUTO_TEST_CASE(gpicontext_allocate_segment) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::Group group(gen_group_ranks(context.get_comm_size())); + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + + BOOST_REQUIRE_NO_THROW(context.allocate_segment(segment_id, group, size_in_bytes)); + collectives::Barrier::GPIBarrierAllRanks barrier; + barrier.blocking_barrier(); + + BOOST_TEST_REQUIRE(get_num_allocated_segments() == 1); + + BOOST_REQUIRE_NO_THROW(context.deallocate_segment(segment_id, group)); + BOOST_TEST_REQUIRE(get_num_allocated_segments() == 0); + } +} diff --git a/test/gpi/Group.cpp b/test/gpi/Group.cpp new file mode 100644 index 00000000..4fb035a1 --- /dev/null +++ b/test/gpi/Group.cpp @@ -0,0 +1,98 @@ +#include "GlobalContextFixture.hpp" +#include "gpi/Group.hpp" +#include "utilities.hpp" + +#include +#include +#include +#include +#include + +#include +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + BOOST_AUTO_TEST_CASE(gpigroup_allocate_one_group) + { + auto& context = GlobalContext::instance()->gpi_cont; + auto nranks_in_group = context.get_comm_size() - 1; + if (nranks_in_group > 0) + { + auto const group_ranks = gen_group_ranks(nranks_in_group); + BOOST_REQUIRE_NO_THROW(GPI::Group const group(group_ranks)); + } + } + + BOOST_AUTO_TEST_CASE(gpigroup_allocate_multiple_group_all) + { + auto &context = GlobalContext::instance()->gpi_cont; + auto const group_ranks = gen_group_ranks(context.get_comm_size()); + GPI::Group const group1(group_ranks); + GPI::Group const group2(group_ranks); + GPI::Group const group3(group_ranks); + + BOOST_TEST_REQUIRE(group1.get_size() == group_ranks.size()); + BOOST_TEST_REQUIRE(group2.get_size() == group_ranks.size()); + BOOST_TEST_REQUIRE(group3.get_size() == group_ranks.size()); + } + + BOOST_AUTO_TEST_CASE(gpigroup_check_ranks_in_group) + { + auto &context = GlobalContext::instance()->gpi_cont; + + auto shuffled_ranks = gen_group_ranks(context.get_comm_size()); + std::shuffle(shuffled_ranks.begin(), 
shuffled_ranks.end(), std::mt19937(42)); + + size_t const nranks_in_group = context.get_comm_size() / 2; + if (nranks_in_group > 0) + { + auto group_ranks_list(shuffled_ranks); + group_ranks_list.resize(nranks_in_group); + GPI::Group const group(group_ranks_list); + + for (auto rank : shuffled_ranks) + { + auto const rank_iter = std::find(group_ranks_list.begin(), group_ranks_list.end(), rank); + if (rank_iter != group_ranks_list.end()) // ranks in the `group_ranks_list` should be found in the group + { + BOOST_TEST_REQUIRE(group.contains_rank(rank)); + } + else + { + BOOST_TEST_REQUIRE(!group.contains_rank(rank)); + } + } + } + } + + BOOST_AUTO_TEST_CASE(gpigroup_throw_allocate_empty_group) + { + std::vector group_ranks; + BOOST_REQUIRE_THROW(GPI::Group const group(group_ranks), std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(gpigroup_multiple_overlapping_groups) + { + std::vector> allocated_groups; + auto& context = GlobalContext::instance()->gpi_cont; + + for (size_t nranks_in_group = 1; nranks_in_group <= context.get_comm_size(); ++nranks_in_group) + { + auto const group_ranks = gen_group_ranks(nranks_in_group); + BOOST_REQUIRE_NO_THROW(allocated_groups.emplace_back(std::make_unique(group_ranks))); + BOOST_TEST_REQUIRE(allocated_groups.back()->get_size() == nranks_in_group); + + if (context.get_rank() < nranks_in_group) // ranks lower than `nranks_in_group` should belong to the group + { + BOOST_TEST_REQUIRE(allocated_groups.back()->contains_rank(context.get_rank())); + } + else // other ranks should not be part of the group + { + BOOST_TEST_REQUIRE(!allocated_groups.back()->contains_rank(context.get_rank())); + } + } + } +} diff --git a/test/gpi/GroupManager.cpp b/test/gpi/GroupManager.cpp new file mode 100644 index 00000000..e7c9318c --- /dev/null +++ b/test/gpi/GroupManager.cpp @@ -0,0 +1,61 @@ +#include "GlobalContextFixture.hpp" +#include "gpi/GroupManager.hpp" +#include "utilities.hpp" + +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + BOOST_AUTO_TEST_SUITE(groupmanager_unit) + BOOST_AUTO_TEST_CASE(groupmanager_no_predefined_group) + { + GPI::GroupManager gmanager; + + auto const& groups = gmanager.get_groups(); + BOOST_TEST_REQUIRE(groups.size() == 0); + } + + BOOST_AUTO_TEST_CASE(groupmanager_create_group) + { + auto& context = GlobalContext::instance()->gpi_cont; + GPI::GroupManager gmanager; + + auto const group = gmanager.create_group(gen_group_ranks(context.get_comm_size())); + + BOOST_REQUIRE_NO_THROW(gmanager.get_groups()); + } + + BOOST_AUTO_TEST_CASE(groupmanager_create_empty_group) + { + GPI::GroupManager gmanager; + BOOST_REQUIRE_THROW(gmanager.create_group({}), std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(groupmanager_create_multiple_groups) + { + auto& context = GlobalContext::instance()->gpi_cont; + GPI::GroupManager gmanager; + + for (auto group_size = 1UL; group_size <= context.get_comm_size(); ++group_size) + { + // create group regardless of whether it contains the current rank or not + BOOST_REQUIRE_NO_THROW(gmanager.create_group(gen_group_ranks(group_size))); + auto const&groups = gmanager.get_groups(); + + if (context.get_rank() < group_size) // groups contain consecutive ranks between [0, group_size) + { + BOOST_TEST_REQUIRE(groups.back().contains_rank(context.get_rank())); + } + else + { + BOOST_TEST_REQUIRE(!groups.back().contains_rank(context.get_rank())); + } + + } + auto num_created_groups = context.get_comm_size(); + BOOST_TEST_REQUIRE(num_created_groups == gmanager.get_groups().size()); + } + 
BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/gpi/NotificationManager.cpp b/test/gpi/NotificationManager.cpp new file mode 100644 index 00000000..7b46438a --- /dev/null +++ b/test/gpi/NotificationManager.cpp @@ -0,0 +1,106 @@ +#include "GlobalContextFixture.hpp" +#include "gpi/NotificationManager.hpp" + +#include + +#include +#include +#include + +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE(GlobalContext); + + BOOST_AUTO_TEST_SUITE(notificationmanager_unit) + + BOOST_AUTO_TEST_CASE(notificationmanager_simple_range) + { + GPI::NotificationManager notif_manager; + GPI::SegmentID segment_id = 0; + notif_manager.register_segment(segment_id); + + std::size_t const num_notifications(10); + auto notification_range = notif_manager.get_notification_range(segment_id, num_notifications); + BOOST_TEST_REQUIRE(notification_range.first == 0); + BOOST_TEST_REQUIRE(notification_range.second == num_notifications); + } + + BOOST_AUTO_TEST_CASE(notificationmanager_throw_max_range) + { + GPI::NotificationManager notif_manager; + GPI::SegmentID segment_id = 0; + notif_manager.register_segment(segment_id); + + gaspi_number_t max_num_notifications; + gaspi_notification_num(&max_num_notifications); + + BOOST_REQUIRE_THROW(notif_manager.get_notification_range(segment_id, max_num_notifications + 1), + std::runtime_error); + + BOOST_REQUIRE_NO_THROW(notif_manager.get_notification_range(segment_id, max_num_notifications)); + } + + BOOST_AUTO_TEST_CASE(notificationmanager_allow_empty_range) + { + GPI::NotificationManager notif_manager; + GPI::SegmentID segment_id = 0; + notif_manager.register_segment(segment_id); + + std::size_t num_notifs = 0; + auto notification_range = notif_manager.get_notification_range(segment_id, num_notifs); + BOOST_TEST_REQUIRE(notification_range.first == notification_range.second); + } + + BOOST_AUTO_TEST_CASE(notificationmanager_consecutive_ranges) + { + GPI::NotificationManager notif_manager; + GPI::SegmentID segment_id = 0; + notif_manager.register_segment(segment_id); + + std::vector const notification_range_sizes{1, 10, 20, 100, 3}; + std::size_t previous_max_notification(0); + GPI::NotificationManager::NotificationRange notification_range; + for (auto num_notifs : notification_range_sizes) + { + notification_range = notif_manager.get_notification_range(segment_id, num_notifs); + BOOST_TEST_REQUIRE(notification_range.first == previous_max_notification); + BOOST_TEST_REQUIRE(notification_range.second == previous_max_notification + num_notifs); + + previous_max_notification += num_notifs; + } + + std::size_t total_num_notifs = std::accumulate(notification_range_sizes.begin(), + notification_range_sizes.end(), 0); + BOOST_TEST_REQUIRE(notification_range.second == total_num_notifs); + } + + BOOST_AUTO_TEST_CASE(notificationmanager_unregistered_segment) + { + GPI::NotificationManager notif_manager; + GPI::SegmentID segment_id = 1; + std::size_t num_notifs = 5; + + BOOST_REQUIRE_THROW(notif_manager.get_notification_range(segment_id, num_notifs), + std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(notificationmanager_multiple_segments) + { + GPI::NotificationManager notif_manager; + std::vector segment_ids{1,2,3,4,5}; + std::size_t num_notifs = 5; + + for (auto const segment_id: segment_ids) + { + notif_manager.register_segment(segment_id); + } + for (auto const segment_id: segment_ids) + { + BOOST_REQUIRE_NO_THROW(notif_manager.get_notification_range(segment_id, num_notifs)); + } + } + + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/gpi/QueueManager.cpp 
b/test/gpi/QueueManager.cpp new file mode 100644 index 00000000..0bf363e1 --- /dev/null +++ b/test/gpi/QueueManager.cpp @@ -0,0 +1,117 @@ +#include "collectives/barrier/GPIBarrier.hpp" +#include "GlobalContextFixture.hpp" +#include "gpi/QueueManager.hpp" +#include "gpi/Segment.hpp" +#include "utilities.hpp" + +#include + +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + namespace + { + void check_queue_id_valid(GPI::QueueID qid) + { + gaspi_number_t max_num_queues_allowed; + gaspi_queue_max(&max_num_queues_allowed); + BOOST_TEST_REQUIRE(qid < max_num_queues_allowed); + + gaspi_number_t queue_max_size; + gaspi_number_t queue_size; + gaspi_queue_size_max(&queue_max_size); + gaspi_queue_size(qid, &queue_size); + BOOST_TEST_REQUIRE(queue_size + 2 <= queue_max_size); + } + + void write_n_requests_to_neighbor(GPI::QueueManager &qmanager, + gaspi_number_t n_requests) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + std::size_t offset = 1; + std::size_t buffer_size = 1; + gaspi_notification_t notif_value = 1; + GPI::Rank next_rank = (context.get_rank() + 1) % context.get_comm_size(); + + GPI::Group group(gen_group_ranks(context.get_comm_size())); + GPI::Segment segment(context, group, segment_id, size_in_bytes); + + collectives::Barrier::GPIBarrierAllRanks barrier; + barrier.blocking_barrier(); + + auto notif_range = std::make_pair(0, n_requests); + for (auto notif_id = notif_range.first; notif_id < notif_range.second; ++notif_id) + { + auto const qid = qmanager.get_queue_id_for_write_notify(); + check_queue_id_valid(qid); + gaspi_write_notify(segment.get_id(), offset, next_rank, + segment.get_id(), offset, buffer_size, + notif_id, notif_value, + qid, GASPI_BLOCK); + } + + for (auto i = 0UL; i < n_requests; ++i) + { + gaspi_notification_id_t notif_id; + gaspi_notify_waitsome(segment.get_id(), + notif_range.first, notif_range.second - notif_range.first, + ¬if_id, GASPI_BLOCK); + gaspi_notify_reset(segment.get_id(), notif_id, ¬if_value); + } + } + } + + BOOST_AUTO_TEST_SUITE(queuemanager_unit) + + BOOST_AUTO_TEST_CASE(queuemanager_request_queue) + { + auto& qmanager = GPI::QueueManager::get_instance(); + + auto const qid = qmanager.get_queue_id_for_write_notify(); + check_queue_id_valid(qid); + } + + BOOST_AUTO_TEST_CASE(queuemanager_request_multiple_queues_without_notif) + { + auto& qmanager = GPI::QueueManager::get_instance(); + std::size_t nqueues = 100; + + for (auto i = 0UL; i < nqueues; ++i) + { + auto const qid = qmanager.get_queue_id_for_write_notify(); + check_queue_id_valid(qid); + } + } + + BOOST_AUTO_TEST_CASE(queuemanager_use_multiple_queues) + { + auto& qmanager = GPI::QueueManager::get_instance(); + + gaspi_number_t max_queue_size; + gaspi_queue_size_max(&max_queue_size); + + gaspi_number_t number_queues; + gaspi_queue_num(&number_queues); + + auto const n_requests = 2 * max_queue_size / 2 * number_queues; + write_n_requests_to_neighbor(qmanager, n_requests); + qmanager.wait_and_flush_queue(); + + // all queues should be empty + gaspi_number_t num_queues; + gaspi_queue_num(&num_queues); + for (auto qid = 0UL; qid < num_queues; ++qid) + { + gaspi_number_t queue_size; + gaspi_queue_size(qid, &queue_size); + BOOST_TEST_REQUIRE(queue_size == 0); + } + } + + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/gpi/SegmentManager.cpp b/test/gpi/SegmentManager.cpp new file mode 100644 index 00000000..d72fc363 --- /dev/null +++ b/test/gpi/SegmentManager.cpp @@ -0,0 +1,212 @@ +#include 
"collectives/barrier/GPIBarrier.hpp" +#include "GlobalContextFixture.hpp" +#include "gpi/gaspiCheckReturn.hpp" +#include "gpi/SegmentManager.hpp" +#include "utilities.hpp" + +#include + +#include +#include +#include +#include + +#include + +namespace tarantella +{ + BOOST_GLOBAL_FIXTURE( GlobalContext ); + + namespace + { + void create_segment_with_id(GPI::Context& context, GPI::SegmentManager& segmentmanager, + GPI::SegmentID segment_id, std::size_t size_in_bytes) + { + GPI::Group group(gen_group_ranks(context.get_comm_size())); + + segmentmanager.create_segment(segment_id, group, size_in_bytes); + collectives::Barrier::GPIBarrierAllRanks barrier; + barrier.blocking_barrier(); + } + } + + BOOST_AUTO_TEST_SUITE(segmentmanager_unit) + BOOST_AUTO_TEST_CASE(segmentmanager_create_manager) + { + auto &context = GlobalContext::instance()->gpi_cont; + BOOST_REQUIRE_NO_THROW(GPI::SegmentManager segmentmanager(context)); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_create_segment) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + BOOST_REQUIRE_NO_THROW(create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes)); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_create_empty_segment) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + GPI::Group group(gen_group_ranks(context.get_comm_size())); + std::size_t size_zero = 0; + + GPI::SegmentManager segmentmanager(context); + BOOST_REQUIRE_THROW(segmentmanager.create_segment(segment_id, group, size_zero), + std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_create_multiple_segments) + { + auto &context = GlobalContext::instance()->gpi_cont; + std::vector segment_ids{1,5,6,31}; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + + for (auto segment_id : segment_ids) + { + BOOST_REQUIRE_NO_THROW(create_segment_with_id(context, segmentmanager, + segment_id, size_in_bytes)); + } + + gaspi_number_t allocated_segments_num; + GPI::gaspiCheckReturn(gaspi_segment_num(&allocated_segments_num), + "get number of segments"); + BOOST_TEST_REQUIRE(allocated_segments_num == segment_ids.size()); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_duplicate_segment_id) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 5; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + + BOOST_REQUIRE_NO_THROW(create_segment_with_id(context, segmentmanager, + segment_id, size_in_bytes)); + BOOST_REQUIRE_THROW(create_segment_with_id(context, segmentmanager, + segment_id, size_in_bytes), + std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_delete_manager) + { + gaspi_number_t initially_allocated_segments_num; + GPI::gaspiCheckReturn(gaspi_segment_num(&initially_allocated_segments_num), + "get number of segments"); + + auto &context = GlobalContext::instance()->gpi_cont; + { + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + } + + // segment manager should be out of scope and all segments deallocated + gaspi_number_t allocated_segments_num; + GPI::gaspiCheckReturn(gaspi_segment_num(&allocated_segments_num), + "get number of segments"); + BOOST_TEST_REQUIRE(allocated_segments_num == initially_allocated_segments_num); + } + + 
BOOST_AUTO_TEST_CASE(segmentmanager_create_segment_buffer) + { + auto &context = GlobalContext::instance()->gpi_cont; + + GPI::SegmentID segment_id = 0; + GPI::SegmentManager segmentmanager(context); + std::size_t const size_in_bytes = 64; + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + + std::size_t const expected_first_offset = 0; + auto const segment_buffer = segmentmanager.get_buffer_of_size(segment_id, size_in_bytes); + BOOST_TEST_REQUIRE(segment_buffer.get_size() == size_in_bytes); + BOOST_TEST_REQUIRE(segment_buffer.get_offset() == expected_first_offset); + + auto const buffer_pointer = reinterpret_cast( + context.get_segment_pointer(segment_buffer.get_segment_id())); + BOOST_TEST_REQUIRE(reinterpret_cast(segment_buffer.get_ptr()) == buffer_pointer); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_segment_buffer_empty) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + + std::size_t const needed_buffer_size_in_bytes = 0; + BOOST_REQUIRE_NO_THROW(segmentmanager.get_buffer_of_size(segment_id, needed_buffer_size_in_bytes)); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_segment_buffer_max_size) + { + auto &context = GlobalContext::instance()->gpi_cont; + + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + + BOOST_REQUIRE_NO_THROW(segmentmanager.get_buffer_of_size(segment_id, size_in_bytes)); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_segment_buffer_too_large) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + + BOOST_REQUIRE_THROW(segmentmanager.get_buffer_of_size(segment_id, size_in_bytes + 1), + std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_segment_buffer_beyond_max_size) + { + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + std::size_t size_in_bytes = 1000; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, size_in_bytes); + + segmentmanager.get_buffer_of_size(segment_id, size_in_bytes - 1); + BOOST_REQUIRE_THROW(segmentmanager.get_buffer_of_size(segment_id, size_in_bytes), + std::runtime_error); + } + + BOOST_AUTO_TEST_CASE(segmentmanager_multiple_segment_buffers) + { + std::vector const sizes_in_bytes {10, 3, 56, 100, 1}; + std::size_t total_segment_size_in_bytes = std::accumulate(sizes_in_bytes.begin(), + sizes_in_bytes.end(), 0); + + auto &context = GlobalContext::instance()->gpi_cont; + GPI::SegmentID segment_id = 0; + GPI::SegmentManager segmentmanager(context); + create_segment_with_id(context, segmentmanager, segment_id, total_segment_size_in_bytes); + + std::size_t current_offset = 0; + for (auto size_in_bytes : sizes_in_bytes) + { + auto const segment_buffer = segmentmanager.get_buffer_of_size(segment_id, size_in_bytes); + BOOST_TEST_REQUIRE(segment_buffer.get_size() == size_in_bytes); + BOOST_TEST_REQUIRE(segment_buffer.get_offset() == current_offset); + BOOST_TEST_REQUIRE(segment_buffer.get_segment_id() == segment_id); + + auto const buffer_pointer = reinterpret_cast( + 
context.get_segment_pointer(segment_buffer.get_segment_id())) + + current_offset; + BOOST_TEST_REQUIRE(reinterpret_cast(segment_buffer.get_ptr()) == buffer_pointer); + + current_offset += size_in_bytes; + } + } + + BOOST_AUTO_TEST_SUITE_END() +} diff --git a/test/python/CMakeLists.txt b/test/python/CMakeLists.txt new file mode 100644 index 00000000..b90484c0 --- /dev/null +++ b/test/python/CMakeLists.txt @@ -0,0 +1,34 @@ +include (add_test_wrappers) + +set(localranks_list 3) +tarantella_generate_python_gpi_test(NAME WeightsDataParallel + TEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/data_parallel_training/weights_test.py + LOCALRANKS_LIST "${localranks_list}" + LABELS integration + TIMEOUT 3600) + +set(localranks_list 3 4) +tarantella_generate_python_gpi_test(NAME AccuracyDataParallel + TEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/data_parallel_training/accuracy_test.py + LOCALRANKS_LIST "${localranks_list}" + LABELS integration long_running + TIMEOUT 10000) + +set(localranks_list 4) +tarantella_generate_python_gpi_test(NAME OptimizersDataParallelMNIST + TEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/data_parallel_training/optimizers_mnist_test.py + LOCALRANKS_LIST "${localranks_list}" + LABELS integration long_running + TIMEOUT 10000) + +set(localranks_list 4) +tarantella_generate_python_gpi_test(NAME OptimizersDataParallelCIFAR + TEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/data_parallel_training/optimizers_cifar10_test.py + LOCALRANKS_LIST "${localranks_list}" + LABELS integration long_running disabled + TIMEOUT 10000) + +tarantella_generate_python_gpi_test(NAME DistributedDatasets + LOCALRANKS_LIST "1" + TEST_FILE ${CMAKE_CURRENT_SOURCE_DIR}/datasets/distributed_dataset.py + TIMEOUT 3600) diff --git a/test/python/conftest.py b/test/python/conftest.py new file mode 100644 index 00000000..2c1fa88d --- /dev/null +++ b/test/python/conftest.py @@ -0,0 +1,36 @@ +import pytest +import logging +import os + +import tensorflow as tf + +@pytest.fixture(scope="session") +def tarantella_framework(): + os.environ['TF_CUDNN_DETERMINISTIC']='1' + + import tarantella + tarantella.init() + + logging.getLogger().info("init tarantella") + yield tarantella # provide the fixture value + logging.getLogger().info("teardown tarantella") + + + +def pytest_configure(config): + # register an additional marker + config.addinivalue_line( + "markers", "tfversion(version): test to run only on specific tf versions" + ) + + +def pytest_runtest_setup(item): + supported_versions = [mark.args[0] for mark in item.iter_markers(name="tfversion")] + if supported_versions: + supportedv = None + for v in supported_versions: + if tf.__version__.startswith(v): + supportedv = v + if not supportedv: + pytest.skip("Test does not support TF{}".format(tf.__version__)) + diff --git a/test/python/data_parallel_training/accuracy_test.py b/test/python/data_parallel_training/accuracy_test.py new file mode 100644 index 00000000..c99ef3ad --- /dev/null +++ b/test/python/data_parallel_training/accuracy_test.py @@ -0,0 +1,73 @@ +from models import mnist_models as mnist +import training_runner as base_runner +import utilities as util +import tarantella + + +import tensorflow as tf +from tensorflow import keras +import numpy as np + +import logging +import pytest + +# Run tests with multiple models as fixtures +# (reuse the same model for various test parameter combinations) +@pytest.fixture(scope="class", params=[mnist.fc_model_generator, + mnist.lenet5_model_generator, + mnist.sequential_model_generator, + mnist.subclassed_model_generator, + ]) +def model_runners(request): 
+ tf.random.set_seed(42) + tnt_model_runner = base_runner.generate_tnt_model_runner(request.param()) + tf.random.set_seed(42) + reference_model_runner = base_runner.TrainingRunner(request.param()) + yield tnt_model_runner, reference_model_runner + +class TestsDataParallelCompareAccuracy: + + def test_initialization(self, tarantella_framework): + assert tarantella_framework + + @pytest.mark.parametrize("micro_batch_size", [32, 61]) + @pytest.mark.parametrize("number_epochs", [3]) + @pytest.mark.parametrize("nbatches", [200]) + def test_compare_accuracy_against_reference(self, tarantella_framework, model_runners, + micro_batch_size, number_epochs, nbatches): + batch_size = micro_batch_size * tarantella_framework.get_size() + nsamples = nbatches * batch_size + + tnt_model_runner, reference_model_runner = model_runners + # reuse model with its initial weights + tnt_model_runner.reset_weights() + reference_model_runner.reset_weights() + + # verify that both models have identical weights + tnt_initial_weights = tnt_model_runner.get_weights() + reference_initial_weights = reference_model_runner.get_weights() + util.compare_weights(tnt_initial_weights, reference_initial_weights, 1e-6) + + # train reference model + (ref_train_dataset, ref_test_dataset) = util.load_dataset(mnist.load_mnist_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 10000, + test_batch_size = batch_size) + reference_model_runner.train_model(ref_train_dataset, number_epochs) + reference_loss_accuracy = reference_model_runner.evaluate_model(ref_test_dataset) + + # train Tarantella model + (train_dataset, test_dataset) = util.load_dataset(mnist.load_mnist_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 10000, + test_batch_size = batch_size) + tnt_model_runner.train_model(train_dataset, number_epochs) + tnt_loss_accuracy = tnt_model_runner.evaluate_model(test_dataset) + + rank = tarantella_framework.get_rank() + logging.getLogger().info("[Rank %d] Tarantella[loss, accuracy] = %s" % (rank, str(tnt_loss_accuracy))) + logging.getLogger().info("[Rank %d] Reference [loss, accuracy] = %s" % (rank, str(reference_loss_accuracy))) + assert np.isclose(tnt_loss_accuracy[0], reference_loss_accuracy[0], atol=1e-2) # losses might not be identical + assert np.isclose(tnt_loss_accuracy[1], reference_loss_accuracy[1], atol=1e-2) diff --git a/test/python/data_parallel_training/optimizers_cifar10_test.py b/test/python/data_parallel_training/optimizers_cifar10_test.py new file mode 100644 index 00000000..c8441e81 --- /dev/null +++ b/test/python/data_parallel_training/optimizers_cifar10_test.py @@ -0,0 +1,50 @@ +from models import cifar10_models as cifar +import training_runner as base_runner +import utilities as util +import tarantella + +import tensorflow as tf +from tensorflow import keras + +import pytest + +# Fixture for CIFAR-10 models +@pytest.fixture(scope="class", params=[cifar.alexnet_model_generator]) +def cifar_model_runner(request): + yield base_runner.generate_tnt_model_runner(request.param()) + +class TestsDataParallelOptimizersCIFAR10: + def test_initialization(self, tarantella_framework): + assert tarantella_framework + + @pytest.mark.parametrize("optimizer", [keras.optimizers.Adadelta, + keras.optimizers.Adagrad, + keras.optimizers.Adam, + keras.optimizers.Adamax, + keras.optimizers.Nadam, + keras.optimizers.RMSprop, + keras.optimizers.SGD + ]) + @pytest.mark.parametrize("micro_batch_size", [64]) + @pytest.mark.parametrize("nbatches", [230]) + 
@pytest.mark.parametrize("ntest_batches", [40]) + def test_cifar_alexnet(self, tarantella_framework, cifar_model_runner, + optimizer, micro_batch_size, nbatches): + batch_size = micro_batch_size * tarantella_framework.get_size() + nsamples = nbatches * batch_size + (number_epochs, lr) = cifar.get_hyperparams(optimizer) + (train_dataset, test_dataset) = util.load_dataset(cifar.load_cifar_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 10000, + test_batch_size = batch_size) + if optimizer.__name__ == 'SGD': + cifar_model_runner.compile_model(optimizer(learning_rate=lr, momentum=0.9)) + else: + cifar_model_runner.compile_model(optimizer(learning_rate=lr)) + + cifar_model_runner.reset_weights() + cifar_model_runner.train_model(train_dataset, number_epochs) + + results = cifar_model_runner.evaluate_model(test_dataset) + util.check_accuracy_greater(results[1], 0.5) diff --git a/test/python/data_parallel_training/optimizers_mnist_test.py b/test/python/data_parallel_training/optimizers_mnist_test.py new file mode 100644 index 00000000..13182f01 --- /dev/null +++ b/test/python/data_parallel_training/optimizers_mnist_test.py @@ -0,0 +1,74 @@ +from models import mnist_models as mnist +import training_runner as base_runner +import utilities as util +import tarantella + +import tensorflow as tf +from tensorflow import keras + +import pytest + +# Run tests with multiple models as fixtures +# (reuse the same model for various test parameter combinations) +# Fixture for MNIST models +@pytest.fixture(scope="class", params=[mnist.lenet5_model_generator, + mnist.sequential_model_generator + ]) +def mnist_model_runner(request): + yield base_runner.generate_tnt_model_runner(request.param()) + +class TestsDataParallelOptimizers: + def test_initialization(self, tarantella_framework): + assert tarantella_framework + + @pytest.mark.parametrize("optimizer", [keras.optimizers.Adadelta, + keras.optimizers.Adagrad, + keras.optimizers.Adam, + keras.optimizers.Adamax, + keras.optimizers.Nadam, + keras.optimizers.RMSprop, + keras.optimizers.SGD + ]) + @pytest.mark.parametrize("micro_batch_size", [64]) + @pytest.mark.parametrize("nbatches", [230]) + def test_compare_accuracy_optimizers(self, tarantella_framework, mnist_model_runner, + optimizer, micro_batch_size, nbatches): + batch_size = micro_batch_size * tarantella_framework.get_size() + nsamples = nbatches * batch_size + (number_epochs, lr) = mnist.get_hyperparams(optimizer) + (train_dataset, test_dataset) = util.load_dataset(mnist.load_mnist_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 10000, + test_batch_size = batch_size) + mnist_model_runner.compile_model(optimizer(learning_rate=lr)) + mnist_model_runner.reset_weights() + mnist_model_runner.train_model(train_dataset, number_epochs) + + results = mnist_model_runner.evaluate_model(test_dataset) + util.check_accuracy_greater(results[1], 0.91) + + @pytest.mark.parametrize("lr", [0.01]) + @pytest.mark.parametrize("nesterov", [False, True]) + @pytest.mark.parametrize("momentum", [0.9]) + @pytest.mark.parametrize("micro_batch_size", [64]) + @pytest.mark.parametrize("nbatches", [230]) + @pytest.mark.parametrize("number_epochs", [8]) + def test_compare_sgd_momentum(self, tarantella_framework, mnist_model_runner, + lr, nesterov, momentum, micro_batch_size, nbatches, + number_epochs): + batch_size = micro_batch_size * tarantella_framework.get_size() + nsamples = nbatches * batch_size + (train_dataset, test_dataset) = 
util.load_dataset(mnist.load_mnist_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 10000, + test_batch_size = batch_size) + mnist_model_runner.compile_model(keras.optimizers.SGD(learning_rate=lr, + momentum=momentum, + nesterov=nesterov)) + mnist_model_runner.reset_weights() + mnist_model_runner.train_model(train_dataset, number_epochs) + + results = mnist_model_runner.evaluate_model(test_dataset) + util.check_accuracy_greater(results[1], 0.91) diff --git a/test/python/data_parallel_training/weights_test.py b/test/python/data_parallel_training/weights_test.py new file mode 100644 index 00000000..baa7a1c7 --- /dev/null +++ b/test/python/data_parallel_training/weights_test.py @@ -0,0 +1,52 @@ +from models import mnist_models as mnist +import training_runner as base_runner +import utilities as util +import tarantella + +import tensorflow as tf +from tensorflow import keras +import numpy as np +import random + +import logging +import pytest + +# Run tests with multiple models as fixtures +# (reuse the same model for various test parameter combinations) +@pytest.fixture(scope="class", params=[mnist.lenet5_model_generator, + mnist.sequential_model_generator, + ]) +def model_runner(request): + yield base_runner.generate_tnt_model_runner(request.param()) + +class TestsDataParallelCompareWeights: + + def test_initialization(self, tarantella_framework): + assert tarantella_framework + + def test_model_initialization(self, model_runner): + assert model_runner.model + + @pytest.mark.parametrize("micro_batch_size", [64]) + @pytest.mark.parametrize("nbatches", [100]) + @pytest.mark.parametrize("number_epochs", [7]) + def test_compare_weights_across_ranks(self, tarantella_framework, model_runner, + micro_batch_size, nbatches, number_epochs): + comm_size = tarantella_framework.get_size() + batch_size = micro_batch_size * comm_size + nsamples = nbatches * batch_size + + (train_dataset, _) = util.load_dataset(mnist.load_mnist_dataset, + train_size = nsamples, + train_batch_size = batch_size, + test_size = 0, + test_batch_size = batch_size) + model_runner.reset_weights() + model_runner.train_model(train_dataset, number_epochs) + final_weights = model_runner.get_weights() + + # broadcast the weights from the master rank to all the participating ranks + model_runner.model._broadcast_weights() + + reference_rank_weights = model_runner.get_weights() + util.compare_weights(final_weights, reference_rank_weights, 1e-6) diff --git a/test/python/datasets/distributed_dataset.py b/test/python/datasets/distributed_dataset.py new file mode 100644 index 00000000..d583b530 --- /dev/null +++ b/test/python/datasets/distributed_dataset.py @@ -0,0 +1,333 @@ +import logging +import numpy as np +import pytest + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + +from tarantella.datasets import distributed_dataset as ds + +def mnist_as_np_arrays(training_samples): + mnist_train_size = 60000 + assert(training_samples <= mnist_train_size) + + # load given number of samples + (x_train_all, y_train_all), _ = keras.datasets.mnist.load_data() + x_train = x_train_all[:training_samples] + y_train = y_train_all[:training_samples] + + # normalization and reshape + x_train = x_train.reshape(training_samples, 28, 28, 1).astype('float32') / 255. 
+ y_train = y_train.astype('float32') + return (x_train, y_train) + +def np_arrays_from_range(training_samples): + return (tf.range(training_samples), tf.range(training_samples)) + + +def gen_dataset_batch(dataset, batch_size, drop_remainder): + dataset = dataset.batch(batch_size, drop_remainder) + dataset = dataset.prefetch(buffer_size=2) + return dataset + +def gen_dataset_multiple_batch(dataset, batch_size, drop_remainder): + dataset = dataset.batch(2, drop_remainder = True) + dataset = dataset.batch(2, drop_remainder= True) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_shuffle_batch(dataset, batch_size, drop_remainder): + dataset = dataset.shuffle(10, seed=44, reshuffle_each_iteration=True) + + dataset = dataset.batch(batch_size, drop_remainder) + dataset = dataset.prefetch(buffer_size=2) + return dataset + +def gen_dataset_filter(dataset, batch_size, drop_remainder): + # Read from multiple files in parallel + dataset = dataset.shuffle(10, seed=44, reshuffle_each_iteration=True) + + def pred(x,y): + return x > 100 + dataset = dataset.filter(predicate = lambda x, y: pred(x,y)) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_flat_map(dataset, batch_size, drop_remainder): + dataset = dataset.batch(batch_size = 3, drop_remainder = False) + + # flat map works on batched datasets + dataset = dataset.flat_map(lambda x, y: tf.data.Dataset.from_tensor_slices((x, y))) + + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_interleave(dataset, batch_size, drop_remainder): + dataset = dataset.batch(batch_size = 3, drop_remainder = False) + dataset = dataset.interleave(map_func = lambda x, y: tf.data.Dataset.from_tensor_slices((x+3, y)), + cycle_length=tf.data.experimental.AUTOTUNE, + block_length=2, + deterministic = True) + + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_interleave_v1(dataset, batch_size, drop_remainder): + dataset = dataset.batch(batch_size = 3, drop_remainder = False) + dataset = dataset.interleave(map_func = lambda x, y: tf.data.Dataset.from_tensor_slices((x+3, y)), + cycle_length=tf.data.experimental.AUTOTUNE, + block_length=2) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_map(dataset, batch_size, drop_remainder): + def map_fn(x, y): + return x*5, y + dataset = dataset.map(lambda x, y: map_fn(x, y), + deterministic = True) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_map_v1(dataset, batch_size, drop_remainder): + def map_fn(x, y): + return x*5, y + dataset = dataset.map(lambda x, y: map_fn(x, y)) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_padded_batch(dataset, batch_size, drop_remainder): + dataset = dataset.map(lambda x, y: tf.fill([4], x)) + dataset = dataset.padded_batch(batch_size, + drop_remainder = drop_remainder, + padded_shapes = 8) + dataset = dataset.prefetch(buffer_size=2) + return dataset + +def gen_dataset_parallel_interleave(dataset, batch_size, drop_remainder): + dataset = dataset.batch(batch_size = 3, drop_remainder = False) + dataset = dataset.interleave(map_func = lambda x, y: tf.data.Dataset.from_tensor_slices((x+3, y)), + cycle_length=tf.data.experimental.AUTOTUNE, + block_length=2, + num_parallel_calls=4, + deterministic = True) + + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_parallel_interleave_v1(dataset, 
batch_size, drop_remainder): + dataset = dataset.batch(batch_size = 3, drop_remainder = False) + dataset = dataset.interleave(map_func = lambda x, y: tf.data.Dataset.from_tensor_slices((x+3, y)), + cycle_length=tf.data.experimental.AUTOTUNE, + num_parallel_calls=4) + + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_parallel_map(dataset, batch_size, drop_remainder): + dataset = dataset.repeat(2) + + def map_fn(x,y): + return x*5, y+x + dataset = dataset.map(map_func = lambda x, y: map_fn(x,y), + num_parallel_calls=2, + deterministic=True) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_parallel_map_v1(dataset, batch_size, drop_remainder): + dataset = dataset.repeat(2) + def map_fn(x,y): + return x*5, y+x + dataset = dataset.map(map_func = lambda x, y: map_fn(x,y), + num_parallel_calls=2) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_io_pipeline(dataset, batch_size, drop_remainder): + # Read from multiple files in parallel + def parse_fn(x,y): + return x,y + + dataset = dataset.map( + map_func = lambda x, y: parse_fn(x,y)) + + dataset = dataset.cache() + # Shuffle samples + dataset = dataset.shuffle(1000, seed = 123) + dataset = dataset.repeat(2) + + # Set number of samples if specified + dataset = dataset.take(batch_size * 3) + + # Preprocess samples (in parallel) + dataset = dataset.map( + parse_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + +def gen_dataset_concatenate(dataset, batch_size, drop_remainder): + dataset = dataset.concatenate(dataset) + dataset = dataset.batch(batch_size, drop_remainder) + return dataset + +def gen_dataset_zip(dataset, batch_size, drop_remainder): + dataset = tf.data.Dataset.zip((dataset, dataset)) + dataset = dataset.batch(batch_size, drop_remainder) + + return dataset + +def validate_local_dataset(ref_dataset, local_dataset, micro_batch_size, rank): + local_dataset_it = iter(local_dataset) + expected_dataset_it = iter(ref_dataset) + + for local_batch, expected_batch in zip(local_dataset_it, expected_dataset_it): + # look at the first dataset when datasets are nested (e.g., after zip, or (samples, targets)) + # TODO: check all elements of the tuples + while isinstance(local_batch, tuple): + local_batch = local_batch[0] + + while isinstance(expected_batch, tuple): + expected_batch = expected_batch[0] + + # extract the slice of the reference dataset that corresponds to `rank` + expected_micro_batch = expected_batch[rank * micro_batch_size: + ((rank+1) * micro_batch_size)] + assert np.array_equal(local_batch,expected_micro_batch) + + # verify that the two datasets have the same length + with pytest.raises(StopIteration): + next(local_dataset_it) + with pytest.raises(StopIteration): + next(expected_dataset_it) + +transformation_test_cases = [ gen_dataset_batch, + gen_dataset_shuffle_batch, + gen_dataset_multiple_batch, + gen_dataset_io_pipeline, + gen_dataset_filter, + gen_dataset_flat_map, + pytest.param(gen_dataset_map, + marks=pytest.mark.tfversion('2.2')), + pytest.param(gen_dataset_map_v1, + marks=[pytest.mark.tfversion('2.0'), + pytest.mark.tfversion('2.1')]), + pytest.param(gen_dataset_interleave, + marks=pytest.mark.tfversion('2.2')), + pytest.param(gen_dataset_interleave_v1, + marks=[pytest.mark.tfversion('2.0'), + pytest.mark.tfversion('2.1')]), + 
pytest.param(gen_dataset_parallel_interleave, + marks=pytest.mark.tfversion('2.2')), + pytest.param(gen_dataset_parallel_interleave_v1, + marks=[pytest.mark.tfversion('2.0'), + pytest.mark.tfversion('2.1')]), + pytest.param(gen_dataset_parallel_map, + marks=pytest.mark.tfversion('2.2')), + pytest.param(gen_dataset_parallel_map_v1, + marks=[pytest.mark.tfversion('2.0'), + pytest.mark.tfversion('2.1')]), + gen_dataset_padded_batch, + gen_dataset_concatenate, + gen_dataset_zip, + ] +@pytest.mark.parametrize("apply_transformations", transformation_test_cases) +@pytest.mark.parametrize("dataset_generator", [np_arrays_from_range]) +@pytest.mark.parametrize("comm_size", [1,3,4]) +@pytest.mark.parametrize("micro_batch_size", [5]) +@pytest.mark.parametrize("num_samples", [91]) +@pytest.mark.parametrize("nepochs", [2]) +def test_with_drop_remainder(apply_transformations, dataset_generator, + comm_size, micro_batch_size, num_samples, + nepochs): + batch_size = comm_size * micro_batch_size + (x_train, y_train) = dataset_generator(num_samples) + + reference_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + tnt_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + + tnt_dataset = apply_transformations(tnt_dataset, + batch_size = batch_size, + drop_remainder=True) + + for rank in range(comm_size): # verify each rank separately + # load local dataset for `rank` + dist_dataset = ds.DistributedDataset(tnt_dataset, + num_ranks = comm_size, + rank = rank) + local_dataset = dist_dataset.distribute_dataset_across_ranks() + micro_batch_size = dist_dataset.get_microbatch_size(batch_size) + + # rebuild reference dataset each time to prevent + # shuffling effects for repeated iterations + ref_dataset = apply_transformations(reference_dataset, + batch_size = batch_size, + drop_remainder=True) + for epoch in range(nepochs): + validate_local_dataset(ref_dataset, local_dataset, micro_batch_size, rank) + + +@pytest.mark.parametrize("apply_transformations", transformation_test_cases) +@pytest.mark.parametrize("dataset_generator", [np_arrays_from_range]) +@pytest.mark.parametrize("comm_size", [1,3,4]) +@pytest.mark.parametrize("micro_batch_size", [5]) +@pytest.mark.parametrize("num_batches", [4]) +@pytest.mark.parametrize("size_final_batch", [0, 1, 6, 11]) +def test_no_drop_remainder(apply_transformations, dataset_generator, + comm_size, micro_batch_size, num_batches, + size_final_batch): + batch_size = comm_size * micro_batch_size + num_samples = num_batches * batch_size + size_final_batch + (x_train, y_train) = dataset_generator(num_samples) + + reference_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + tnt_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + + # Dataset should behve like the sequential dataset with `drop_ramainder=True` + tnt_dataset = apply_transformations(tnt_dataset, + batch_size = batch_size, + drop_remainder=False) + + for rank in range(comm_size): # verify each rank separately + # load local dataset for `rank` + dist_dataset = ds.DistributedDataset(tnt_dataset, + num_ranks = comm_size, + rank = rank) + local_dataset = dist_dataset.distribute_dataset_across_ranks() + micro_batch_size = dist_dataset.get_microbatch_size(batch_size) + + # rebuild reference dataset each time to prevent + # shuffling effects for repeated iterations + ref_dataset = apply_transformations(reference_dataset, + batch_size = batch_size, + drop_remainder=True) + validate_local_dataset(ref_dataset, local_dataset, micro_batch_size, rank) + + 
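+# Worked example of the per-rank slicing that `validate_local_dataset` checks
+# (an illustration only, not executed by the tests): with comm_size = 3 and
+# micro_batch_size = 5, the global batch size is 15 and rank 1 should receive
+# elements 5..9 of every reference batch:
+#   batch = np.arange(15)
+#   rank, micro_batch_size = 1, 5
+#   batch[rank * micro_batch_size : (rank + 1) * micro_batch_size]  # -> [5 6 7 8 9]
+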
+@pytest.mark.parametrize("apply_transformations", transformation_test_cases) +@pytest.mark.parametrize("dataset_generator", [np_arrays_from_range]) +@pytest.mark.parametrize("comm_size", [3, 4]) +@pytest.mark.parametrize("micro_batch_size", [5]) +@pytest.mark.parametrize("size_batch_remainder", [1, 7, 11]) +def test_batch_not_multiple_num_ranks(apply_transformations, dataset_generator, + comm_size, micro_batch_size, + size_batch_remainder): + batch_size = comm_size * micro_batch_size + size_batch_remainder + num_samples = 4 * batch_size + (x_train, y_train) = dataset_generator(num_samples) + + tnt_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) + tnt_dataset = apply_transformations(tnt_dataset, + batch_size = batch_size, + drop_remainder=True) + + for rank in range(comm_size): # verify each rank separately + dist_dataset = ds.DistributedDataset(tnt_dataset, + num_ranks = comm_size, + rank = rank) + # distributing the dataset should fail because the batch size is not a + # multiple of the number of ranks + with pytest.raises(ValueError): + local_dataset = dist_dataset.distribute_dataset_across_ranks() diff --git a/test/python/models/cifar10_models.py b/test/python/models/cifar10_models.py new file mode 100644 index 00000000..10bee192 --- /dev/null +++ b/test/python/models/cifar10_models.py @@ -0,0 +1,65 @@ +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + +import numpy as np +import logging + +# Optimizer Hyperparameters +# Dictionary: Optimizer Name: (number_epochs, learning_rate) +hyperparams_cifar = {'Adadelta': (12, 1), + 'Adagrad': (20, 0.05), + 'Adam': (5, 0.001), + 'Adamax': (10, 0.001), + 'Nadam': (10, 0.0001), + 'RMSprop': (10, 0.001), + 'SGD': (20, 0.01)} + +def get_hyperparams(optimizer): + opt = optimizer.__name__ + return hyperparams_cifar.get(opt) + +# Load CIFAR-10 dataset +def load_cifar_dataset(training_samples, validation_samples, test_samples): + cifar_train_size = 60000 + cifar_test_size = 10000 + assert(training_samples + validation_samples <= cifar_train_size) + assert(test_samples <= cifar_test_size) + + # load given number of samples + (x_train_all, y_train_all), (x_test_all, y_test_all) = keras.datasets.cifar10.load_data() + x_train = x_train_all[:training_samples] + y_train = y_train_all[:training_samples] + x_val = x_train_all[training_samples:training_samples+validation_samples] + y_val = y_train_all[training_samples:training_samples+validation_samples] + x_test = x_test_all[:test_samples] + y_test = y_test_all[:test_samples] + + # Preprocess the data (these are Numpy arrays) + x_train = x_train.reshape(-1, 32, 32, 3).astype('float32') / 255 + x_test = x_test.reshape(-1, 32, 32, 3).astype('float32') / 255 + y_train = y_train.astype('float32') + y_test = y_test.astype('float32') + + return (x_train, y_train), (x_val, y_val), (x_test, y_test) + +def alexnet_model_generator(): + inputs = keras.Input(shape=(32,32,3,), name='input') + x = layers.Conv2D(96, 3, strides=(4, 4), activation='relu')(inputs) + x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = layers.Conv2D(256, 5, padding='same', activation='relu')(x) + x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = layers.Conv2D(384, 3, padding='same', activation='relu')(x) + x = layers.Conv2D(384, 3, padding='same', activation='relu')(x) + x = layers.Conv2D(256, 3, padding='same', activation='relu')(x) + x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = layers.Flatten()(x) + x = layers.Dense(4096, 
activation='relu')(x) + x = layers.Dropout(0.4)(x) + x = layers.Dense(4096, activation='relu')(x) + x = layers.Dropout(0.4)(x) + outputs = layers.Dense(10, activation='softmax')(x) + model = keras.Model(inputs=inputs, outputs=outputs) + + logging.getLogger().info("Initialized AlexNet model") + return model diff --git a/test/python/models/mnist_models.py b/test/python/models/mnist_models.py new file mode 100644 index 00000000..402cffd5 --- /dev/null +++ b/test/python/models/mnist_models.py @@ -0,0 +1,114 @@ +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np + +import logging + +# Optimizer Hyperparameters +# Dictionary: Optimizer Name: (number_epochs, learning_rate) +hyperparams_mnist = {'Adadelta': (1, 1), + 'Adagrad': (3, 0.01), + 'Adam': (1, 0.001), + 'Adamax': (2, 0.001), + 'Nadam': (1, 0.002), + 'RMSprop': (1, 0.001), + 'SGD': (8, 0.01)} + +def get_hyperparams(optimizer): + opt = optimizer.__name__ + return hyperparams_mnist.get(opt) + +# Load MNIST dataset +def load_mnist_dataset(training_samples, validation_samples, test_samples): + mnist_train_size = 60000 + mnist_test_size = 10000 + assert(training_samples + validation_samples <= mnist_train_size) + assert(test_samples <= mnist_test_size) + + # load given number of samples + (x_train_all, y_train_all), (x_test_all, y_test_all) = keras.datasets.mnist.load_data() + x_train = x_train_all[:training_samples] + y_train = y_train_all[:training_samples] + x_val = x_train_all[training_samples:training_samples+validation_samples] + y_val = y_train_all[training_samples:training_samples+validation_samples] + x_test = x_test_all[:test_samples] + y_test = y_test_all[:test_samples] + + # normalization and reshape + x_train = x_train.reshape(training_samples, 28, 28, 1).astype('float32') / 255. + x_val = x_val.reshape(validation_samples, 28, 28, 1).astype('float32') / 255. + x_test = x_test.reshape(test_samples, 28, 28, 1).astype('float32') / 255. 
+ y_train = y_train.astype('float32') + y_val = y_val.astype('float32') + y_test = y_test.astype('float32') + + return (x_train, y_train), (x_val, y_val), (x_test, y_test) + +def fc_model_generator(): + inputs = keras.Input(shape=(28,28,1,), name='input') + x = layers.Flatten()(inputs) + x = layers.Dense(200, activation='relu', name='FC1')(x) + x = layers.Dense(200, activation='relu', name='FC2')(x) + outputs = layers.Dense(10, activation='softmax', name='softmax')(x) + model = keras.Model(inputs=inputs, outputs=outputs) + logging.getLogger().info("Initialized FC model") + return model + +def lenet5_model_generator(): + inputs = keras.Input(shape=(28,28,1,), name='input') + x = layers.Conv2D(20, 5, padding="same", activation='relu')(inputs) + x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = layers.Conv2D(50, 5, padding="same", activation='relu')(x) + x = layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x) + x = layers.Flatten()(x) + x = layers.Dense(500, activation='relu')(x) + outputs = layers.Dense(10, activation='softmax')(x) + model = keras.Model(inputs=inputs, outputs=outputs) + logging.getLogger().info("Initialized LeNet5 model") + return model + +def sequential_model_generator(): + model = keras.Sequential() + model.add(keras.layers.Flatten(input_shape=(28,28,1,))) + model.add(layers.Dense(200, activation='relu', name='FC1')) + model.add(layers.Dense(200, activation='relu', name='FC2')) + model.add(layers.Dense(10, activation='softmax', name='softmax')) + + logging.getLogger().info("Initialized Sequential model") + return model + +def alexnet_model_generator(): + inputs = keras.Input(shape=(28,28,1,), name='input') + x = layers.Conv2D(32, 3, strides=(1, 1), padding='valid', activation='relu')(inputs) + x = layers.MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='valid')(x) + x = layers.Conv2D(32, 3, strides=(1, 1), padding='valid', activation='relu')(x) + x = layers.MaxPooling2D(pool_size=(3, 3), strides=(1, 1), padding='valid')(x) + x = layers.Conv2D(64, 3, strides=(1, 1), padding='valid', activation='relu')(x) + x = layers.Conv2D(64, 3, strides=(1, 1), padding='valid', activation='relu')(x) + x = layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='valid')(x) + x = layers.Flatten()(x) + x = layers.Dense(512, activation='relu')(x) + outputs = layers.Dense(10, activation='softmax')(x) + model = keras.Model(inputs=inputs, outputs=outputs) + + logging.getLogger().info("Initialized AlexNet model") + return model + +class SubclassedModel(tf.keras.Model): + def __init__(self): + super(SubclassedModel, self).__init__() + self.flatten = keras.layers.Flatten(input_shape=(28,28,1,)) + self.dense = keras.layers.Dense(200, activation='relu', name='FC') + self.classifier = keras.layers.Dense(10, activation='softmax', name='softmax') + logging.getLogger().info("Initialized SubclassedModel") + + def call(self, inputs): + x = self.flatten(inputs) + x = self.dense(x) + return self.classifier(x) + +def subclassed_model_generator(): + model = SubclassedModel() + model.build((None,28,28,1)) + return model \ No newline at end of file diff --git a/test/python/pytest.ini b/test/python/pytest.ini new file mode 100644 index 00000000..86d2db5e --- /dev/null +++ b/test/python/pytest.ini @@ -0,0 +1,7 @@ +[pytest] +filterwarnings = + ignore::DeprecationWarning + ignore::PendingDeprecationWarning + +log_cli=true +log_level=2 diff --git a/test/python/training_runner.py b/test/python/training_runner.py new file mode 100644 index 00000000..757033aa --- /dev/null +++ 
b/test/python/training_runner.py
@@ -0,0 +1,45 @@
+import tensorflow as tf
+from tensorflow import keras
+
+import tarantella as tnt
+
+def generate_tnt_model_runner(model):
+  model_data_par = tnt.Model(model)
+  runner = TrainingRunner(model_data_par)
+  return runner
+
+# Wrap tarantella model creation and compiling, as they should be executed only once
+class TrainingRunner:
+  def __init__(self, model):
+    self.learning_rate = 0.001
+    self.optimizer = keras.optimizers.Adam(learning_rate=self.learning_rate)
+    self.loss = keras.losses.SparseCategoricalCrossentropy()
+    self.metric = keras.metrics.SparseCategoricalAccuracy()
+    self.model = model
+
+    self.compile_model(self.optimizer)
+    self.initial_weights = model.get_weights()
+
+  def compile_model(self, optimizer):
+    self.model.compile(optimizer=optimizer,
+                       loss=self.loss,
+                       metrics=[self.metric],
+                       experimental_run_tf_function=False)
+
+  def train_model(self, train_dataset, number_epochs):
+    self.model.fit(train_dataset,
+                   epochs = number_epochs,
+                   verbose = 0,
+                   shuffle = False)
+
+  def get_weights(self):
+    return self.model.get_weights()
+
+  def reset_weights(self):
+    self.model.set_weights(self.initial_weights)
+
+  def evaluate_model(self, val_dataset):
+    # return_dict to be added here (support only from tf 2.2)
+    results = self.model.evaluate(val_dataset, verbose=0)
+    return results
+
diff --git a/test/python/utilities.py b/test/python/utilities.py
new file mode 100644
index 00000000..eef93252
--- /dev/null
+++ b/test/python/utilities.py
@@ -0,0 +1,35 @@
+import datetime
+import tensorflow as tf
+import numpy as np
+import logging
+
+def create_dataset_from_arrays(samples, labels, batch_size):
+  assert(len(samples) == len(labels))
+  ds = tf.data.Dataset.from_tensor_slices((samples, labels))
+  return ds.batch(batch_size)
+
+def load_dataset(dataset_loader,
+                 train_size, train_batch_size,
+                 test_size, test_batch_size):
+  shuffle_seed = current_date()
+
+  (x_train, y_train), (x_val, y_val), (x_test, y_test) = dataset_loader(train_size, 0, test_size)
+  train_dataset = create_dataset_from_arrays(x_train, y_train, train_batch_size)
+  test_dataset = create_dataset_from_arrays(x_test, y_test, test_batch_size)
+
+  train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed, reshuffle_each_iteration = True)
+  return (train_dataset, test_dataset)
+
+def current_date():
+  date = datetime.datetime.now()
+  return int(date.strftime("%Y%m%d"))
+
+def check_accuracy_greater(accuracy, acc_value):
+  logging.getLogger().info("Test accuracy: %s" % accuracy)
+  assert accuracy > acc_value
+
+def compare_weights(weights1, weights2, tolerance):
+  wtocompare = list(zip(weights1, weights2))
+  for (tensor1, tensor2) in wtocompare:
+    assert np.allclose(tensor1, tensor2, atol=tolerance)
+
diff --git a/test/utilities.hpp b/test/utilities.hpp
new file mode 100644
index 00000000..3e02300d
--- /dev/null
+++ b/test/utilities.hpp
@@ -0,0 +1,28 @@
+#include "collectives/TensorInfo.hpp"
+#include "gpi/Types.hpp"
+
+#include <numeric>
+#include <vector>
+
+namespace tarantella
+{
+  std::vector<GPI::Rank> gen_group_ranks(size_t nranks_in_group)
+  {
+    std::vector<GPI::Rank> group_ranks(nranks_in_group);
+    std::iota(group_ranks.begin(), group_ranks.end(), 0);
+    return group_ranks;
+  }
+}
+
+namespace std
+{
+  std::ostream& operator<< (std::ostream& os, const std::vector<collectives::TensorInfo>& tlist)
+  {
+    for (auto& tinfo : tlist)
+    {
+      os << "TensorID=" << tinfo.get_id() << " nelems=" << tinfo.get_nelems()
+         << " dtype_size=" << getDataTypeSize(tinfo.get_elem_type()) << std::endl;
+    }
+    return os;
+  }
+}
\ No newline at end of file
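
The expected allreduce results used by the AllreduceTestSetupGenerator above amount to an element-wise sum of all ranks' input buffers, with AVERAGE additionally dividing the sum by the group size. A minimal NumPy sketch of that reference computation, for illustration only (the function name and the string operation tags are placeholders, not part of the Tarantella API):

    import numpy as np

    def expected_allreduce_output(rank_buffers, op="SUM"):
        # rank_buffers: one equally sized input buffer per rank
        result = np.sum(np.asarray(rank_buffers, dtype=np.float64), axis=0)
        if op == "AVERAGE":
            result /= len(rank_buffers)  # divide the element-wise sum by the group size
        return result

    # Two ranks contributing [1, 2, 3] and [3, 4, 5]:
    print(expected_allreduce_output([[1, 2, 3], [3, 4, 5]], "SUM"))      # [4. 6. 8.]
    print(expected_allreduce_output([[1, 2, 3], [3, 4, 5]], "AVERAGE"))  # [2. 3. 4.]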