diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..23c6316b
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,47 @@
+cmake_minimum_required(VERSION 3.8)
+
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+project(tarantella VERSION 0.6.0)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_FLAGS "-O3 -Wall -Wextra -Werror")
+
+option(LINK_IB "Defines whether to link against InfiniBand drivers [default: disabled]" off)
+option(ENABLE_TESTING "Compile tests [default: disabled]" off)
+option(BUILD_DOCS "Build documentation [default: disabled]" off)
+
+set(SRC_DIR "${CMAKE_SOURCE_DIR}/src")
+set(CMAKE_BUILD_DIR "${CMAKE_SOURCE_DIR}/build")
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib/tarantella")
+set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin")
+
+find_package(GPI2 REQUIRED)
+find_package(pybind11 REQUIRED)
+find_package(Tensorflow REQUIRED)
+
+add_subdirectory(${SRC_DIR})
+add_subdirectory(${SRC_DIR}/gpi_comm_lib/gpi)
+add_subdirectory(${SRC_DIR}/gpi_comm_lib/collectives)
+add_subdirectory(${SRC_DIR}/gpi_comm_lib)
+add_subdirectory(${SRC_DIR}/gpi_comm_lib/tf_ops)
+
+if (BUILD_DOCS)
+ find_package(Sphinx)
+ add_subdirectory(docs)
+endif()
+
+if (ENABLE_TESTING)
+ find_package(Boost 1.61 REQUIRED COMPONENTS
+ unit_test_framework)
+ find_package(PythonModules REQUIRED COMPONENTS
+ numpy
+ pytest)
+ enable_testing()
+ set(SLEEP_TIME_AFTER_TEST 4)
+ add_subdirectory(${CMAKE_SOURCE_DIR}/test)
+endif()
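+
+# Example configuration (a sketch; paths and option values are placeholders):
+#   cmake -DCMAKE_INSTALL_PREFIX=/opt/tarantella -DENABLE_TESTING=ON /path/to/tarantella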
+
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..67da183f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,101 @@
+TARANTELLA END USER LICENSE AGREEMENT
+October 21, 2020
+
+PLEASE READ THIS LICENSE AGREEMENT CAREFULLY. BY USING THE SOFTWARE TARANTELLA YOU
+ACCEPT ALL TERMS OF THE LICENSE AGREEMENT. IF YOU DO NOT AGREE TO THE TERMS OF
+THIS LICENSE, DO NOT INSTALL, COPY, OR USE THE SOFTWARE.
+
+1.) DEFINITIONS
+
+1.1) LICENSOR: Fraunhofer Gesellschaft zur Foerderung der angewandten Forschung
+e.V., Hansastr. 27c, 80686 Muenchen, Germany, as legal entity of Fraunhofer-
+Institut fuer Techno- und Wirtschaftsmathematik, Fraunhofer-Platz 1,
+67663 Kaiserslautern, Germany.
+
+1.2) LICENSEE: The user of Tarantella under this License Agreement.
+
+1.3) LICENSED SOFTWARE: The Software Tarantella in source code and object code form
+including all executable programs.
+
+1.4) DOCUMENTATION: The Tarantella documentation, user's guide, e-mails and other explanatory
+materials accompanying the LICENSED SOFTWARE in printed or electronic form.
+
+2.) OWNERSHIP / INTELLECTUAL PROPERTY RIGHTS
+
+LICENSEE acknowledges that ownership and all intellectual property rights
+related to the LICENSED SOFTWARE and to the DOCUMENTATION, including patents,
+copyright, company or trade secrets remain with the LICENSOR.
+
+LICENSEE promises to keep and not to modify the copyright notices of the
+LICENSOR.
+
+3.) SCOPE OF LICENSE
+
+3.1) Provided LICENSEE accepts all terms of this License Agreement, LICENSEE
+is granted a non-exclusive, non-assignable right to use the LICENSED SOFTWARE,
+which means LICENSEE may use the software for an unrestricted number of users,
+as well as use the accompanying DOCUMENTATION by the actual number of users.
+
+3.2) Without prior written consent of LICENSOR or an authorized partner,
+LICENSEE may modify the source code and use the modified version of the LICENSED
+SOFTWARE for internal use only.
+
+3.2.1) LICENSEE must inform users of modified versions about the fact that the
+software differs from the original version.
+
+3.2.2) The LICENSED SOFTWARE and the modifications generated by LICENSEE shall
+remain the property of LICENSOR and no rights, including but not limited to the
+right to apply for industrial property rights, are granted to LICENSEE.
+
+3.3) Without prior written consent of LICENSOR or an authorized partner,
+LICENSEE may not:
+- use, copy or distribute the LICENSED SOFTWARE except as provided for under
+ sections 3.1 and 3.2.
+- provide commercial turn-key solutions based on the LICENSED SOFTWARE or
+ commercial services for the LICENSED SOFTWARE to any third party.
+- rent or lease the LICENSED SOFTWARE and DOCUMENTATION to any third party.
+- modify, adapt, or translate the LICENSED SOFTWARE for any third party.
+
+3.4) The license under this License Agreement relates to the LICENSED SOFTWARE.
+
+4.) LIMITED WARRANTY AND LIABILITY
+
+4.1) LICENSOR confirms that the LICENSED SOFTWARE has been developed without
+infringement of any rights of third parties, in particular patents, copyrights
+or other intellectual property rights of third parties. Nevertheless LICENSOR
+does not warrant that the use of the LICENSED SOFTWARE by LICENSEE does not
+infringe any third party intellectual property rights.
+
+4.2) LICENSEE is aware that there is a risk that the LICENSED SOFTWARE might
+damage the data or the computer of the LICENSEE or even other computers on the
+network in unpredictable ways. The use of the LICENSED SOFTWARE is at the
+exclusive risk of the LICENSEE. LICENSOR does not offer any warranty either
+expressed or implied and is not liable for any damages resulting from the use of
+the LICENSED SOFTWARE or DOCUMENTATION such as, but not limited to, data loss.
+
+4.3) Notwithstanding sections 4.1 and 4.2, the liability of the LICENSOR, its
+legal representatives and employees resulting from breach of duty or tort is
+restricted to damages caused intentionally or by gross negligence. In any case,
+the liability under this section is limited by typical, foreseeable, direct
+damages. The liability is unrestricted for damages of the body, life or health.
+
+5.) MISCELLANEOUS
+
+This License Agreement in English is the original one. The terms of this
+Agreement can only be modified or amended in writing. In case of interpretation
+controversies the terms of this Agreement shall prevail over the respective
+terms of any other agreements.
+
+This Agreement is construed under the Law of the Federal Republic of Germany.
+Therefore, any and all controversies resulting out of this Agreement shall be
+resolved under the Law of the Federal Republic of Germany excluding the German
+International Private Law Rules. The application of the UN-Convention of the
+International Sales of Goods (CISG) is explicitly excluded. Exclusive venue of
+jurisdiction for both parties shall be Munich, Germany.
+
+In case that one or several of the terms of this Agreement should be or become
+invalid or unenforceable, the validity of the other terms shall remain
+unaffected. In such a case, the parties shall replace the invalid or
+unenforceable condition by another legally effective provision meeting the
+purpose of the abolished provision to the greatest extent. The same applies in
+case of a gap of regulation.
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..86c77563
--- /dev/null
+++ b/README.md
@@ -0,0 +1,41 @@
+![Tarantella](docs/source/pics/tnt_logo_text.png)
+
+
+
+Tarantella is an open-source, distributed Deep Learning framework built on top of TensorFlow 2,
+providing scalable Deep Neural Network training on CPU and GPU compute clusters.
+
+Tarantella is easy to use, allows users to re-use existing TensorFlow 2/Keras models,
+and does not require any knowledge of parallel computing.
+
+
+## Goals
+
+Tarantella is designed to meet the following goals:
+
+* strong scalability
+* ease of use
+* synchronous training scheme
+* seamless integration with existing Keras models
+* support for GPU and CPU systems
+
+## Install
+
+To build Tarantella from source, the following dependencies are required:
+
+* [TensorFlow 2](https://www.tensorflow.org/install) (supported versions TF2.2, TF2.1, TF2.0)
+* [GPI-2](https://github.com/cc-hpc-itwm/GPI-2) (version 1.4.0)
+* [pybind11](https://github.com/pybind/pybind11) (from version 2.4.3)
+* C++ compiler (e.g., `gcc` from version 7.4.0)
+* CMake (from version 3.8)
+
+Detailed installation instructions can be found in the [technical docs](https://tarantella.readthedocs.io/en/latest/installation.html).
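+
+A minimal out-of-source build could look as follows (a sketch; the repository URL
+and install prefix are assumptions to adapt to your setup):
+
+```bash
+git clone https://github.com/cc-hpc-itwm/tarantella.git
+cd tarantella
+mkdir build && cd build
+cmake -DCMAKE_INSTALL_PREFIX=/opt/tarantella ..
+make install
+```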
+
+## Resources
+
+* [Official website](https://www.tarantella.org)
+* [Technical documentation](https://tarantella.readthedocs.io/en/latest)
+
+## License
+
+[License](LICENSE)
diff --git a/cmake/FindDNNL.cmake b/cmake/FindDNNL.cmake
new file mode 100644
index 00000000..1f015465
--- /dev/null
+++ b/cmake/FindDNNL.cmake
@@ -0,0 +1,37 @@
+# Finds Intel DNNL library
+# Martin Kuehn May 2020
+
+find_path(DNNL_INCLUDE_DIR
+ NAMES dnnl.hpp
+ PATHS ${DNNL_ROOT}
+ ENV DNNL_ROOT
+ ${DNNL_ROOT_DIR}
+ ENV DNNL_ROOT_DIR
+ PATH_SUFFIXES include
+ DOC "DNNL header files"
+)
+
+find_library(DNNL_LIBRARY dnnl
+ PATHS ${DNNL_ROOT}
+ ENV DNNL_ROOT
+ ${DNNL_ROOT_DIR}
+ ENV DNNL_ROOT_DIR
+ PATH_SUFFIXES lib lib64
+ DOC "DNNL library files")
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args(DNNL
+ DEFAULT_MSG
+ DNNL_LIBRARY
+ DNNL_INCLUDE_DIR)
+
+mark_as_advanced(DNNL_INCLUDE_DIR DNNL_LIBRARY)
+
+set(DNNL_INCLUDE_DIRS ${DNNL_INCLUDE_DIR})
+set(DNNL_LIBRARIES ${DNNL_LIBRARY})
+
+if(DNNL_FOUND AND NOT TARGET dnnl)
+ add_library(dnnl SHARED IMPORTED GLOBAL)
+ target_include_directories(dnnl INTERFACE ${DNNL_INCLUDE_DIRS})
+ set_property(TARGET dnnl PROPERTY IMPORTED_LOCATION ${DNNL_LIBRARIES})
+endif()
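+
+# Typical usage (a sketch; `my_target` is a placeholder):
+#   find_package(DNNL REQUIRED)
+#   target_link_libraries(my_target PRIVATE dnnl)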
diff --git a/cmake/FindGPI2.cmake b/cmake/FindGPI2.cmake
new file mode 100644
index 00000000..d4bd3360
--- /dev/null
+++ b/cmake/FindGPI2.cmake
@@ -0,0 +1,133 @@
+
+#[=======================================================================[.rst:
+FindGPI2
+-------
+
+Finds the GPI2 library.
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported targets, if found:
+
+``GPI2::GPI2``
+ The GPI2 library
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``GPI2_FOUND``
+ True if the system has the GPI2 library.
+``GPI2_INCLUDE_DIRS``
+ Include directories needed to use GPI2.
+``GPI2_LIBRARIES``
+ Libraries needed to link to GPI2.
+``GPI2_DBG_LIBRARIES``
+ Libraries needed to link to the Debug version of GPI2.
+``GPI2_GASPI_RUN``
+ Path to ``gaspi_run``.
+
+Cache Variables
+^^^^^^^^^^^^^^^
+
+The following cache variables may also be set:
+
+``GPI2_INCLUDE_DIR``
+ The directory containing ``gaspi.h``.
+``GPI2_LIBRARY``
+ The path to the GPI2 library.
+
+#]=======================================================================]
+
+set(GPI2_LIBRARY_NAME "GPI2")
+set(GPI2_DBG_LIBRARY_NAME "GPI2-dbg")
+
+FIND_PROGRAM(GASPIRUN_PATH gaspi_run
+ PATHS
+ $ENV{PATH}
+ $ENV{LIB_DIR}/bin
+ /usr/local/bin/
+ /usr/bin/
+ )
+
+IF (GASPIRUN_PATH)
+ get_filename_component(GASPIRUN_FOUND_HOME ${GASPIRUN_PATH} DIRECTORY)
+ get_filename_component(GPI2_INSTALLED_PATH ${GASPIRUN_FOUND_HOME} DIRECTORY)
+ get_filename_component(GPI2_INSTALLED_PATH ${GPI2_INSTALLED_PATH} REALPATH)
+ENDIF(GASPIRUN_PATH)
+
+find_path (GPI2_INCLUDE_DIR GASPI.h
+ PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH}
+ PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH
+ PATH_SUFFIXES include)
+
+find_library (GPI2_DBG_LIBRARY ${GPI2_DBG_LIBRARY_NAME}
+ PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH}
+ PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH
+ PATH_SUFFIXES lib lib64)
+
+find_library (GPI2_LIBRARY ${GPI2_LIBRARY_NAME}
+ PATHS ${GPI2_DEFAULT_PATH} ${GPI2_INSTALLED_PATH}
+ PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH
+ PATH_SUFFIXES lib lib64)
+
+if (GPI2_DBG_LIBRARY)
+ message(STATUS "GPI2-dbg library path: ${GPI2_DBG_LIBRARY}" )
+else(GPI2_DBG_LIBRARY)
+ message(STATUS "GPI2-dbg library path: not found" )
+endif()
+
+
+if (GPI2_LIBRARY)
+ message(STATUS "GPI2 library path: ${GPI2_LIBRARY}" )
+else(GPI2_LIBRARY)
+ message(STATUS "GPI2 library path: not found" )
+endif()
+
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set GPI2_FOUND to TRUE
+# if all listed variables are TRUE
+find_package_handle_standard_args(GPI2 DEFAULT_MSG
+ GASPIRUN_PATH
+ GPI2_DBG_LIBRARY GPI2_LIBRARY)
+
+mark_as_advanced(GPI2_INCLUDE_DIR GASPIRUN_PATH
+ GPI2_DBG_LIBRARY GPI2_LIBRARY)
+set(GPI2_INCLUDE_DIRS ${GPI2_INCLUDE_DIR} )
+set(GPI2_DBG_LIBRARIES ${GPI2_DBG_LIBRARY} )
+set(GPI2_LIBRARIES ${GPI2_LIBRARY} )
+set(GPI2_GASPI_RUN ${GASPIRUN_PATH})
+
+message(STATUS "Found GPI2: " ${GPI2_FOUND})
+
+if(GPI2_FOUND AND NOT TARGET GPI2::GPI2)
+ set(THREADS_PREFER_PTHREAD_FLAG ON)
+ find_package(Threads REQUIRED)
+ add_library(GPI2::GPI2 SHARED IMPORTED GLOBAL)
+ target_link_libraries(GPI2::GPI2 INTERFACE Threads::Threads)
+ target_include_directories(GPI2::GPI2 INTERFACE ${GPI2_INCLUDE_DIRS})
+ set_property(TARGET GPI2::GPI2 PROPERTY IMPORTED_LOCATION ${GPI2_LIBRARIES})
+
+ add_library(GPI2::GPI2dbg SHARED IMPORTED GLOBAL)
+ target_link_libraries(GPI2::GPI2dbg INTERFACE Threads::Threads)
+ target_include_directories(GPI2::GPI2dbg INTERFACE ${GPI2_INCLUDE_DIRS})
+ set_property(TARGET GPI2::GPI2dbg PROPERTY IMPORTED_LOCATION ${GPI2_DBG_LIBRARIES})
+
+ if (LINK_IB)
+ find_package(IBverbs)
+
+ if (IBverbs_FOUND)
+ message (STATUS "GPI2: linking against ibverbs")
+ target_link_libraries(GPI2::GPI2 INTERFACE IBverbs::IBverbs)
+ target_link_libraries(GPI2::GPI2dbg INTERFACE IBverbs::IBverbs)
+ else()
+ message (FATAL_ERROR "GPI2: could not find ibverbs, disable Infiniband \
+ support (-DLINK_IB=OFF) to load GPI-2")
+ endif()
+ else()
+ message (STATUS "GPI2: loading library without Infiniband support")
+ endif()
+endif()
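+
+# Typical usage (a sketch; `my_target` is a placeholder):
+#   find_package(GPI2 REQUIRED)
+#   target_link_libraries(my_target PRIVATE GPI2::GPI2)  # or GPI2::GPI2dbg for debugging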
diff --git a/cmake/FindIBverbs.cmake b/cmake/FindIBverbs.cmake
new file mode 100644
index 00000000..aeb205e6
--- /dev/null
+++ b/cmake/FindIBverbs.cmake
@@ -0,0 +1,61 @@
+
+#[=======================================================================[.rst:
+FindIBverbs
+-------
+
+Finds the IBverbs library.
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported targets, if found:
+
+``IBverbs::IBverbs``
+ The IBverbs library
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``IBverbs_FOUND``
+ True if the system has the IBverbs library.
+``IBverbs_INCLUDE_DIRS``
+ Include directories needed to use IBverbs.
+``IBverbs_LIBRARIES``
+ Libraries needed to link to IBverbs.
+
+Cache Variables
+^^^^^^^^^^^^^^^
+
+The following cache variables may also be set:
+
+``IBverbs_INCLUDE_DIR``
+ The directory containing the public headers.
+``IBverbs_LIBRARY``
+ The path to the IBverbs library.
+
+#]=======================================================================]
+
+find_path(IBverbs_INCLUDE_DIR
+ NAMES infiniband/verbs.h
+ )
+
+find_library(IBverbs_LIBRARY
+ NAMES ibverbs)
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set IBverbs_FOUND to TRUE
+# if all listed variables are TRUE
+find_package_handle_standard_args(IBverbs DEFAULT_MSG
+ IBverbs_INCLUDE_DIR IBverbs_LIBRARY)
+
+mark_as_advanced(IBverbs_INCLUDE_DIR IBverbs_LIBRARY)
+set(IBverbs_LIBRARIES ${IBverbs_LIBRARY})
+set(IBverbs_INCLUDE_DIRS ${IBverbs_INCLUDE_DIR})
+
+if(IBverbs_FOUND AND NOT TARGET IBverbs::IBverbs)
+ add_library(IBverbs::IBverbs SHARED IMPORTED GLOBAL)
+ target_include_directories(IBverbs::IBverbs INTERFACE ${IBverbs_INCLUDE_DIRS})
+ set_property(TARGET IBverbs::IBverbs PROPERTY IMPORTED_LOCATION ${IBverbs_LIBRARIES})
+endif()
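+
+# Typical usage (a sketch; `my_target` is a placeholder):
+#   find_package(IBverbs REQUIRED)
+#   target_link_libraries(my_target PRIVATE IBverbs::IBverbs)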
diff --git a/cmake/FindPythonModules.cmake b/cmake/FindPythonModules.cmake
new file mode 100644
index 00000000..3cb0ed11
--- /dev/null
+++ b/cmake/FindPythonModules.cmake
@@ -0,0 +1,60 @@
+#[=======================================================================[.rst:
+FindPythonModules
+-------
+
+Finds installed PythonModules
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``PythonModules_FOUND``
+ True if all the required PythonModules could be loaded.
+``PythonModules_modulename_FOUND``
+ True if `modulename` could be loaded.
+``Python_EXECUTABLE``
+ Path to the Python executable.
+
+#]=======================================================================]
+
+execute_process(COMMAND sh -c "which python"
+ OUTPUT_VARIABLE python_path
+ RESULT_VARIABLE result
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+if (result EQUAL "0" AND EXISTS ${python_path})
+ set(Python_EXECUTABLE "${python_path}")
+endif()
+
+set(PythonModules_FOUND TRUE)
+if (Python_EXECUTABLE)
+ foreach (module IN LISTS PythonModules_FIND_COMPONENTS)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c
+ "import ${module}"
+ RESULT_VARIABLE result
+ ERROR_QUIET OUTPUT_QUIET)
+
+ if(result)
+ set (PythonModules_${module}_FOUND FALSE)
+ set (PythonModules_FOUND FALSE)
+ else()
+ set (PythonModules_${module}_FOUND TRUE)
+ endif()
+ endforeach()
+endif()
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (PythonModules
+ REQUIRED_VARS Python_EXECUTABLE PythonModules_FOUND
+ HANDLE_COMPONENTS)
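+
+# Typical usage (a sketch):
+#   find_package(PythonModules REQUIRED COMPONENTS numpy pytest)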
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
new file mode 100644
index 00000000..406dc8bb
--- /dev/null
+++ b/cmake/FindSphinx.cmake
@@ -0,0 +1,16 @@
+include(FindPackageHandleStandardArgs)
+
+find_program(Sphinx_EXECUTABLE
+ NAMES sphinx-build sphinx-build2
+ DOC "Path to sphinx-build executable")
+
+find_package_handle_standard_args(Sphinx REQUIRED_VARS Sphinx_EXECUTABLE)
+
+if (Sphinx_FOUND)
+ mark_as_advanced(Sphinx_EXECUTABLE)
+endif()
+
+if (Sphinx_FOUND AND NOT TARGET Sphinx::Sphinx)
+ add_executable(Sphinx::Sphinx IMPORTED)
+ set_property(TARGET Sphinx::Sphinx PROPERTY IMPORTED_LOCATION ${Sphinx_EXECUTABLE})
+endif()
diff --git a/cmake/FindTensorflow.cmake b/cmake/FindTensorflow.cmake
new file mode 100644
index 00000000..4afa4616
--- /dev/null
+++ b/cmake/FindTensorflow.cmake
@@ -0,0 +1,106 @@
+
+#[=======================================================================[.rst:
+FindTensorflow
+-------
+
+Finds the Tensorflow package as described in:
+https://www.tensorflow.org/guide/create_op#compile_the_op_using_your_system_compiler_tensorflow_binary_installation
+
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported targets, if found:
+
+``Tensorflow::Tensorflow``
+ The Tensorflow library.
+ The target will set the CXX11_ABI_FLAG according to the ABI used to compile the TensorFlow library.
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``Tensorflow_FOUND``
+ True if the system has the Tensorflow library.
+``Tensorflow_INCLUDE_DIRS``
+ Include directories needed to use Tensorflow.
+``Tensorflow_LIBRARIES``
+ Libraries needed to link to Tensorflow.
+
+Cache Variables
+^^^^^^^^^^^^^^^
+
+The following cache variables may also be set:
+
+``Tensorflow_INCLUDE_DIR``
+ The directory containing the Tensorflow library headers.
+``Tensorflow_LIBRARY``
+ The path to the Tensorflow library.
+
+#]=======================================================================]
+
+execute_process(COMMAND sh -c "which python"
+ OUTPUT_VARIABLE python_path
+ RESULT_VARIABLE result
+ ERROR_QUIET
+ OUTPUT_STRIP_TRAILING_WHITESPACE)
+if (result EQUAL "0" AND EXISTS ${python_path})
+ set(Python_EXECUTABLE "${python_path}")
+endif()
+
+if (Python_EXECUTABLE)
+ execute_process(COMMAND ${Python_EXECUTABLE} -c
+ "import tensorflow as tf; print(tf.sysconfig.get_lib())"
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE result_tf_lib
+ OUTPUT_VARIABLE Tensorflow_LIBRARY_DIR
+ ERROR_QUIET)
+
+ execute_process(COMMAND ${Python_EXECUTABLE} -c
+ "import tensorflow as tf; print(tf.sysconfig.get_include())"
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE result_tf_incl
+ OUTPUT_VARIABLE Tensorflow_INCLUDE_DIR
+ ERROR_QUIET)
+
+ execute_process(COMMAND ${Python_EXECUTABLE} -c
+ "import tensorflow as tf; print(tf.sysconfig.CXX11_ABI_FLAG)"
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ RESULT_VARIABLE result_tf_abi_flag
+ OUTPUT_VARIABLE Tensorflow_CXX11_ABI_FLAG
+ ERROR_QUIET)
+endif()
+
+set(Tensorflow_LIBRARY_NAME libtensorflow_framework.so.2)
+find_library (Tensorflow_LIBRARY ${Tensorflow_LIBRARY_NAME}
+ PATHS ${Tensorflow_LIBRARY_DIR}
+ PATHS ENV LD_LIBRARY_PATH DYLD_LIBRARY_PATH)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Tensorflow DEFAULT_MSG
+ Tensorflow_LIBRARY
+ Tensorflow_INCLUDE_DIR)
+
+mark_as_advanced(Tensorflow_INCLUDE_DIR Tensorflow_LIBRARY)
+set(Tensorflow_INCLUDE_DIRS ${Tensorflow_INCLUDE_DIR} )
+set(Tensorflow_LIBRARIES ${Tensorflow_LIBRARY} )
+
+message(STATUS "Found Tensorflow: " ${Tensorflow_FOUND})
+
+if(Tensorflow_FOUND AND NOT TARGET Tensorflow::Tensorflow)
+ add_library(Tensorflow::Tensorflow SHARED IMPORTED GLOBAL)
+ target_include_directories(Tensorflow::Tensorflow INTERFACE ${Tensorflow_INCLUDE_DIRS})
+ set_property(TARGET Tensorflow::Tensorflow PROPERTY IMPORTED_LOCATION ${Tensorflow_LIBRARIES})
+
+ # Enable libraries that link against the TensorFlow library to use
+ # the correct value of the CXX11_ABI_FLAG.
+ # E.g., the official pip TensorFlow packages require CXX11_ABI_FLAG=0,
+ # whereas the conda packages set CXX11_ABI_FLAG=1.
+ if ("${result_tf_abi_flag}" EQUAL "0")
+ target_compile_definitions(Tensorflow::Tensorflow INTERFACE _GLIBCXX_USE_CXX11_ABI=${Tensorflow_CXX11_ABI_FLAG})
+ endif()
+endif()
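+
+# Typical usage (a sketch; `my_target` is a placeholder):
+#   find_package(Tensorflow REQUIRED)
+#   target_link_libraries(my_target PRIVATE Tensorflow::Tensorflow)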
+
+
+
diff --git a/cmake/add_macros.cmake b/cmake/add_macros.cmake
new file mode 100644
index 00000000..12411693
--- /dev/null
+++ b/cmake/add_macros.cmake
@@ -0,0 +1,101 @@
+
+macro (_default_if_unset VAR VAL)
+ if (NOT ${VAR})
+ set (${VAR} ${VAL})
+ endif()
+endmacro()
+
+include (parse_arguments)
+
+function (extended_add_library)
+ set (options POSITION_INDEPENDENT PRECOMPILED INSTALL)
+ set (one_value_options NAME NAMESPACE TYPE INSTALL_DESTINATION)
+ set (multi_value_options
+ LIBRARIES SOURCES PUBLIC_HEADERS INCLUDE_DIRECTORIES RPATH
+ SYSTEM_INCLUDE_DIRECTORIES COMPILE_DEFINITIONS COMPILE_OPTIONS DEPENDS
+ )
+ set (required_options NAME)
+ _parse_arguments (ARG "${options}" "${one_value_options}" "${multi_value_options}" "${required_options}" ${ARGN})
+
+ _default_if_unset (ARG_TYPE "STATIC")
+ _default_if_unset (ARG_INSTALL_DESTINATION "lib")
+
+ if (ARG_NAMESPACE)
+ set (target_name "${ARG_NAMESPACE}-${ARG_NAME}")
+ else()
+ set (target_name "${ARG_NAME}")
+ endif()
+
+ if (NOT (${ARG_TYPE} STREQUAL "STATIC" OR ${ARG_TYPE} STREQUAL "SHARED" OR ${ARG_TYPE} STREQUAL "MODULE"))
+ message (FATAL_ERROR "Bad library type: ${ARG_TYPE}")
+ endif()
+
+ set (_scope_specifier)
+ if ((NOT ARG_SOURCES AND NOT ARG_MOC) OR ARG_PRECOMPILED)
+ set (_scope_specifier INTERFACE)
+
+ add_library (${target_name} INTERFACE)
+
+ if (ARG_PRECOMPILED)
+ if (ARG_TYPE STREQUAL "STATIC")
+ list (APPEND ARG_LIBRARIES "${CMAKE_CURRENT_SOURCE_DIR}/lib${target_name}.a")
+ else()
+ list (APPEND ARG_LIBRARIES "${CMAKE_CURRENT_SOURCE_DIR}/lib${target_name}.so")
+ endif()
+ endif()
+
+ target_link_libraries (${target_name} INTERFACE ${ARG_LIBRARIES})
+ else()
+ set (_scope_specifier PUBLIC)
+
+ # _moc (${ARG_NAME}_mocced ${ARG_MOC})
+
+ add_library (${target_name} ${ARG_TYPE} #${${ARG_NAME}_mocced}
+ ${ARG_SOURCES})
+
+ target_link_libraries (${target_name} ${ARG_LIBRARIES})
+ endif()
+ if (ARG_NAMESPACE)
+ add_library (${ARG_NAMESPACE}::${ARG_NAME} ALIAS ${target_name})
+ endif()
+ if (ARG_PUBLIC_HEADERS)
+ set_property (TARGET ${target_name} APPEND
+ PROPERTY PUBLIC_HEADER ${ARG_PUBLIC_HEADERS}
+ )
+ endif()
+
+ if (ARG_SYSTEM_INCLUDE_DIRECTORIES)
+ target_include_directories (${target_name} SYSTEM
+ ${ARG_SYSTEM_INCLUDE_DIRECTORIES})
+ endif()
+ if (ARG_INCLUDE_DIRECTORIES)
+    target_include_directories (${target_name} PUBLIC
+      $<BUILD_INTERFACE:${ARG_INCLUDE_DIRECTORIES}>)
+ endif()
+
+ if (ARG_POSITION_INDEPENDENT)
+ set_property (TARGET ${target_name} APPEND
+ PROPERTY COMPILE_FLAGS -fPIC
+ )
+ endif()
+
+ if (ARG_DEPENDS)
+ add_dependencies (${target_name} ${ARG_DEPENDS})
+ endif()
+
+ if (ARG_COMPILE_DEFINITIONS)
+ target_compile_definitions (${target_name} ${_scope_specifier} ${ARG_COMPILE_DEFINITIONS})
+ endif()
+
+ if (ARG_COMPILE_OPTIONS)
+ target_compile_options (${target_name} ${_scope_specifier} ${ARG_COMPILE_OPTIONS})
+ endif()
+
+ if (ARG_INSTALL)
+ install (TARGETS ${target_name}
+ LIBRARY DESTINATION "${ARG_INSTALL_DESTINATION}"
+ ARCHIVE DESTINATION "${ARG_INSTALL_DESTINATION}"
+ )
+ endif()
+endfunction()
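+
+# Example (a sketch with hypothetical names):
+#   extended_add_library (NAME collectives
+#                         NAMESPACE tnt
+#                         TYPE SHARED
+#                         SOURCES allreduce.cpp
+#                         LIBRARIES GPI2::GPI2
+#                         POSITION_INDEPENDENT
+#                         INSTALL)
+# This creates the target `tnt-collectives`, also aliased as `tnt::collectives`.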
+
diff --git a/cmake/add_test.cmake b/cmake/add_test.cmake
new file mode 100644
index 00000000..a803e2cb
--- /dev/null
+++ b/cmake/add_test.cmake
@@ -0,0 +1,174 @@
+include (parse_arguments)
+
+function (compile_tarantella_test)
+ set(one_value_options NAME DESCRIPTION)
+ set(multi_value_options SOURCES LIBRARIES INCLUDE_DIRECTORIES
+ SYSTEM_INCLUDE_DIRECTORIES ARGS COMPILE_FLAGS)
+ set(required_options NAME SOURCES)
+
+ # save each argument into a variable named "ARG_argname"
+ _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+
+ _default_if_unset(ARG_DESCRIPTION "${ARG_NAME}")
+ set(target_name ${ARG_NAME})
+
+ add_executable (${target_name} ${ARG_SOURCES})
+ list (APPEND ARG_LIBRARIES Boost::unit_test_framework
+ Boost::dynamic_linking)
+ target_compile_definitions (${target_name} PRIVATE
+ "-DBOOST_TEST_MODULE=\"${ARG_DESCRIPTION}\""
+ "-DBOOST_TEST_DYN_LINK")
+
+ #! \note Use RPATH for all tests
+ set_property (TARGET ${target_name} PROPERTY BUILD_WITH_INSTALL_RPATH true)
+ set_property (TARGET ${target_name} APPEND PROPERTY
+ INSTALL_RPATH
+ ${Boost_INCLUDE_DIR}/../lib:${CMAKE_BINARY_DIR})
+
+ if (Boost_VERSION VERSION_EQUAL 1.60 OR Boost_VERSION VERSION_GREATER 1.60)
+ list (INSERT ARG_ARGS 0 "--")
+ endif()
+
+ if (ARG_SYSTEM_INCLUDE_DIRECTORIES)
+ target_include_directories (${target_name} SYSTEM
+ ${ARG_SYSTEM_INCLUDE_DIRECTORIES})
+ endif()
+ if (ARG_INCLUDE_DIRECTORIES)
+ target_include_directories (${target_name} PRIVATE ${ARG_INCLUDE_DIRECTORIES})
+ endif()
+
+ target_link_libraries (${target_name} ${ARG_LIBRARIES})
+ if (ARG_COMPILE_FLAGS)
+ set_property (TARGET ${target_name} PROPERTY COMPILE_FLAGS ${ARG_COMPILE_FLAGS})
+ endif()
+endfunction()
+
+function (tarantella_gen_environment_paths)
+ set(multi_value_options VARIABLE_LIST)
+ set(required_options VARIABLE_LIST)
+ _parse_arguments(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ set(env_var_names PATH LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH CPATH PYTHONPATH)
+ set(env_vars )
+
+ foreach (var_name ${env_var_names})
+ if (DEFINED ENV{${var_name}})
+ list(APPEND env_vars "${var_name}=$ENV{${var_name}}")
+ endif()
+ endforeach()
+ set(${ARG_VARIABLE_LIST} ${env_vars} PARENT_SCOPE)
+endfunction()
+
+function (tarantella_gen_executable_script)
+ set(one_value_options SCRIPT_DIR SCRIPT_NAME)
+ set(required_options SCRIPT_DIR SCRIPT_NAME)
+ _parse_arguments(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+
+ set(tmp_script_path ${CMAKE_CURRENT_BINARY_DIR}/tmp/${ARG_SCRIPT_NAME})
+ file(REMOVE ${ARG_SCRIPT_DIR}/${ARG_SCRIPT_NAME})
+ file(WRITE ${tmp_script_path} "")
+ file(COPY ${tmp_script_path}
+ DESTINATION ${ARG_SCRIPT_DIR}
+ FILE_PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
+ )
+ file(REMOVE ${tmp_script_path})
+endfunction()
+
+function (tarantella_gen_gpi_machinefile)
+ set(one_value_options NRANKS FILENAME)
+ set(required_options NRANKS FILENAME)
+ _parse_arguments(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+
+ file(WRITE ${ARG_FILENAME} "")
+ cmake_host_system_information(RESULT hostname QUERY HOSTNAME)
+ foreach(index RANGE 1 ${ARG_NRANKS})
+ file(APPEND ${ARG_FILENAME} "${hostname}\n")
+ endforeach()
+endfunction()
+
+function (tarantella_gen_test_script)
+ set(one_value_options NAME SCRIPT_DIR TEST_FILE)
+ set(options IS_PYTHON_TEST)
+ set(required_options NAME SCRIPT_DIR TEST_FILE)
+ _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+
+ message(STATUS "Test: Generating ${ARG_NAME} script")
+ tarantella_gen_executable_script(SCRIPT_NAME ${ARG_NAME}
+ SCRIPT_DIR ${ARG_SCRIPT_DIR})
+
+ tarantella_gen_environment_paths(VARIABLE_LIST env_paths)
+
+ set(script_path ${ARG_SCRIPT_DIR}/${ARG_NAME})
+ foreach (var ${env_paths})
+ file(APPEND ${script_path} "export ${var}\n")
+ endforeach()
+ if (ARG_IS_PYTHON_TEST)
+ # Python test
+ file(APPEND ${script_path} "export PYTHONPATH=${CMAKE_BINARY_DIR}:${CMAKE_SOURCE_DIR}/src:\$\{PYTHONPATH\}\n")
+ file(APPEND ${script_path} "\n${Python_EXECUTABLE} -m pytest ${ARG_TEST_FILE}\n")
+ else()
+ # regular executable test
+ file(APPEND ${script_path} "\n${ARG_TEST_FILE}\n")
+ endif()
+endfunction()
+
+function (tarantella_add_gpi_test)
+ set(one_value_options NAME TARGET_FILE NRANKS RUNCOMMAND TEST_FILE
+ MACHINEFILE CLEANUP TIMEOUT SLEEP)
+ set(multi_value_options LABELS)
+ set(required_options NAME TARGET_FILE NRANKS RUNCOMMAND)
+ _parse_arguments_with_unknown(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ _default_if_unset(ARG_SLEEP 0)
+ set(test_name ${ARG_NAME}_${ARG_NRANKS}ranks)
+
+ # increase overall timeout time to include the sleep time after the actual test
+ if (ARG_TIMEOUT)
+ math(EXPR ARG_TIMEOUT "${ARG_SLEEP} + ${ARG_TIMEOUT}")
+ endif()
+
+ if (ARG_MACHINEFILE)
+ # use user-defined machinefile
+ set(runparams "-n ${ARG_NRANKS} -m ${ARG_MACHINEFILE}")
+ else()
+ # generate machinefile for ARG_NRANKS running on the localhost
+ set(machinefile_path ${CMAKE_CURRENT_BINARY_DIR}/machinefile_${ARG_NAME}_${ARG_NRANKS}.tmp)
+ tarantella_gen_gpi_machinefile(NRANKS ${ARG_NRANKS}
+ FILENAME ${machinefile_path})
+ set(runparams "-n ${ARG_NRANKS} -m ${machinefile_path}")
+ endif()
+
+ # create gaspi_run test
+ add_test(NAME ${test_name}
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+ COMMAND "${CMAKE_COMMAND}"
+ -DRUNCOMMAND=${ARG_RUNCOMMAND}
+ -DRUNCOMMAND_ARGS="${runparams}"
+ -DTEST_EXECUTABLE="${ARG_TARGET_FILE}"
+ -DTEST_DIR="${CMAKE_BINARY_DIR}"
+ -DSLEEP="${ARG_SLEEP}"
+ -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake"
+ )
+
+ # set labels if specified
+ if (ARG_LABELS)
+ set_property(TEST ${test_name} PROPERTY LABELS ${ARG_LABELS})
+ endif()
+
+ # set cleanup fixture script if specified
+ if (ARG_CLEANUP)
+ set_tests_properties(${test_name} PROPERTIES FIXTURES_REQUIRED ${ARG_CLEANUP})
+ endif()
+
+ # set timeout if specified
+ if (ARG_TIMEOUT)
+ set_tests_properties(${test_name} PROPERTIES TIMEOUT ${ARG_TIMEOUT})
+ endif()
+
+ # make sure the GPI tests are not run in parallel
+ set_tests_properties(${test_name} PROPERTIES RESOURCE_LOCK GPI_run_serial)
+endfunction()
diff --git a/cmake/add_test_wrappers.cmake b/cmake/add_test_wrappers.cmake
new file mode 100644
index 00000000..119c1ef8
--- /dev/null
+++ b/cmake/add_test_wrappers.cmake
@@ -0,0 +1,151 @@
+include (add_test)
+
+function (tarantella_compile_and_generate_gpi_test)
+ set (one_value_options NAME DESCRIPTION TIMEOUT)
+ set (multi_value_options LOCALRANKS_LIST SOURCES LIBRARIES INCLUDE_DIRECTORIES
+ SYSTEM_INCLUDE_DIRECTORIES ARGS COMPILE_FLAGS)
+ set (required_options NAME SOURCES LOCALRANKS_LIST)
+ _parse_arguments (ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ _default_if_unset (ARG_TIMEOUT 10)
+ set(CLEANUP_TEST_NAME gpi_cleanup)
+
+ set (target_name ${ARG_NAME}.test)
+ compile_tarantella_test(${ARGN}
+ NAME ${target_name})
+
+ # wrap call to the test executable in a script that exports the current environment
+ # the script can then be executed within a `gaspi_run` call
+ set(script_name run_${ARG_NAME}.sh)
+ set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name})
+ tarantella_gen_test_script(NAME ${script_name}
+ SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR}
+ TEST_FILE ${CMAKE_CURRENT_BINARY_DIR}/${target_name})
+
+ message(STATUS "Test: Generating gaspi_run tests for ${ARG_NAME} with ${ARG_LOCALRANKS_LIST} ranks")
+ foreach(nlocalranks ${ARG_LOCALRANKS_LIST})
+ tarantella_add_gpi_test (NAME ${ARG_NAME}
+ NRANKS ${nlocalranks}
+ TARGET_FILE ${script_path}
+ TEST_FILE "${CMAKE_CURRENT_BINARY_DIR}/${target_name}"
+ RUNCOMMAND ${GPI2_GASPI_RUN}
+ CLEANUP ${CLEANUP_TEST_NAME}
+ TIMEOUT ${ARG_TIMEOUT}
+ SLEEP ${SLEEP_TIME_AFTER_TEST})
+ endforeach()
+endfunction()
+
+function (tarantella_compile_and_generate_test)
+ set (one_value_options NAME DESCRIPTION TIMEOUT)
+ set (multi_value_options SOURCES LIBRARIES INCLUDE_DIRECTORIES
+ SYSTEM_INCLUDE_DIRECTORIES ARGS COMPILE_FLAGS
+ LABELS)
+ set (required_options NAME SOURCES)
+ _parse_arguments (ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ _default_if_unset (ARG_TIMEOUT 10)
+
+ set (target_name ${ARG_NAME}.test)
+ compile_tarantella_test(${ARGN}
+ NAME ${target_name})
+  add_test (NAME ${ARG_NAME}
+            COMMAND $<TARGET_FILE:${target_name}> ${ARG_ARGS})
+
+  # set labels if specified
+  if (ARG_LABELS)
+    set_property(TEST ${ARG_NAME} PROPERTY LABELS ${ARG_LABELS})
+  endif()
+
+  # set timeout if specified
+  if (ARG_TIMEOUT)
+    set_tests_properties(${ARG_NAME} PROPERTIES TIMEOUT ${ARG_TIMEOUT})
+  endif()
+endfunction()
+
+function (tarantella_generate_python_gpi_test)
+ set (one_value_options NAME TEST_FILE DESCRIPTION TIMEOUT)
+ set (multi_value_options LOCALRANKS_LIST LABELS ARGS)
+ set (required_options NAME TEST_FILE LOCALRANKS_LIST)
+ _parse_arguments (ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ set(CLEANUP_TEST_NAME gpi_cleanup)
+ _default_if_unset (ARG_TIMEOUT 600)
+ _default_if_unset (ARG_LABELS "Python")
+
+ list(APPEND ARG_LABELS "Python")
+ list(REMOVE_DUPLICATES ARG_LABELS)
+
+ # wrap call to the test executable in a script that exports the current environment
+ # the script can then be executed within a `gaspi_run` call
+ set(script_name run_${ARG_NAME}.sh)
+ set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name})
+ tarantella_gen_test_script(NAME ${script_name}
+ SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR}
+ TEST_FILE ${ARG_TEST_FILE}
+ IS_PYTHON_TEST)
+
+ message(STATUS "Test: Generating gaspi_run tests for ${ARG_NAME} with ${ARG_LOCALRANKS_LIST} ranks")
+ foreach(nlocalranks ${ARG_LOCALRANKS_LIST})
+ tarantella_add_gpi_test (NAME ${ARG_NAME}
+ NRANKS ${nlocalranks}
+ TARGET_FILE ${script_path}
+ TEST_FILE "${ARG_TEST_FILE}"
+ RUNCOMMAND ${GPI2_GASPI_RUN}
+ TIMEOUT ${ARG_TIMEOUT}
+ CLEANUP ${CLEANUP_TEST_NAME}
+ SLEEP ${SLEEP_TIME_AFTER_TEST}
+ LABELS ${ARG_LABELS})
+ endforeach()
+endfunction()
+
+function (tarantella_generate_python_test)
+ set (one_value_options NAME TEST_FILE DESCRIPTION TIMEOUT)
+ set (multi_value_options LABELS ARGS)
+ set (required_options NAME TEST_FILE)
+ _parse_arguments (ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" "${required_options}" ${ARGN})
+ set(CLEANUP_TEST_NAME gpi_cleanup)
+ _default_if_unset (ARG_TIMEOUT 600)
+ _default_if_unset (ARG_LABELS "Python")
+
+ list(APPEND ARG_LABELS "Python")
+ list(REMOVE_DUPLICATES ARG_LABELS)
+
+ # wrap call to the test executable in a script that exports the current environment
+ # the script can then be executed within a `gaspi_run` call
+ set(script_name run_${ARG_NAME}.sh)
+ set(script_path ${CMAKE_CURRENT_BINARY_DIR}/${script_name})
+ tarantella_gen_test_script(NAME ${script_name}
+ SCRIPT_DIR ${CMAKE_CURRENT_BINARY_DIR}
+ TEST_FILE ${ARG_TEST_FILE}
+ IS_PYTHON_TEST)
+
+ # create gaspi_run test
+ add_test(NAME ${ARG_NAME}
+ WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+ COMMAND "${CMAKE_COMMAND}"
+ -DRUNCOMMAND=bash
+ -DRUNCOMMAND_ARGS=" "
+ -DTEST_EXECUTABLE="${script_path}"
+ -DTEST_DIR="${CMAKE_BINARY_DIR}"
+ -DSLEEP="1"
+ -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake"
+ )
+
+ # set labels if specified
+ if (ARG_LABELS)
+ set_property(TEST ${ARG_NAME} PROPERTY LABELS ${ARG_LABELS})
+ endif()
+
+ # set cleanup fixture script if specified
+ if (ARG_CLEANUP)
+ set_tests_properties(${ARG_NAME} PROPERTIES FIXTURES_REQUIRED ${ARG_CLEANUP})
+ endif()
+
+ # set timeout if specified
+ if (ARG_TIMEOUT)
+ set_tests_properties(${ARG_NAME} PROPERTIES TIMEOUT ${ARG_TIMEOUT})
+ endif()
+
+ message(STATUS "Test: Generating test ${ARG_NAME}")
+endfunction()
diff --git a/cmake/cleanup.sh b/cmake/cleanup.sh
new file mode 100644
index 00000000..2562bf59
--- /dev/null
+++ b/cmake/cleanup.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+procs=`ps aux | grep --regexp="\(py\)\?test" | grep -v ctest | grep -v grep`
+if [ -n "$procs" ] ;then
+  ps aux | grep --regexp="\(py\)\?test" | grep -v ctest | grep -v grep | awk '{print $2}' | xargs kill > /dev/null 2>&1
+fi
diff --git a/cmake/parse_arguments.cmake b/cmake/parse_arguments.cmake
new file mode 100644
index 00000000..2d4290d8
--- /dev/null
+++ b/cmake/parse_arguments.cmake
@@ -0,0 +1,27 @@
+# equivalent to CMakeParseArguments except that parse_arguments
+# * forbids UNPARSED_ARGUMENTS but requires to explicitly use
+# parse_arguments_with_unknown
+# * allows to specify required arguments
+
+include (CMakeParseArguments)
+
+macro (_parse_arguments _prefix _options _one_value_options _multi_value_options _required_options)
+ _parse_arguments_with_unknown ("${_prefix}" "${_options}" "${_one_value_options}" "${_multi_value_options}" "${_required_options}" ${ARGN})
+
+ if (${_prefix}_UNPARSED_ARGUMENTS)
+ list (LENGTH ${_prefix}_UNPARSED_ARGUMENTS _unparsed_length)
+ if (NOT _unparsed_length EQUAL 0)
+ message (FATAL_ERROR "unknown arguments: ${${_prefix}_UNPARSED_ARGUMENTS}")
+ endif()
+ endif()
+endmacro()
+
+macro (_parse_arguments_with_unknown _prefix _options _one_value_options _multi_value_options _required_options)
+ cmake_parse_arguments ("${_prefix}" "${_options}" "${_one_value_options}" "${_multi_value_options}" ${ARGN})
+
+ foreach (required ${_required_options})
+ if (NOT ${_prefix}_${required})
+ message (FATAL_ERROR "required argument ${required} missing")
+ endif()
+ endforeach()
+endmacro()
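+
+# Example (a sketch): a function with a required NAME and an optional SOURCES list
+#   function (my_function)
+#     set (one_value_options NAME)
+#     set (multi_value_options SOURCES)
+#     set (required_options NAME)
+#     _parse_arguments (ARG "" "${one_value_options}" "${multi_value_options}"
+#                       "${required_options}" ${ARGN})
+#     message (STATUS "NAME=${ARG_NAME} SOURCES=${ARG_SOURCES}")
+#   endfunction()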
diff --git a/cmake/run_test.cmake b/cmake/run_test.cmake
new file mode 100644
index 00000000..88ef0e67
--- /dev/null
+++ b/cmake/run_test.cmake
@@ -0,0 +1,50 @@
+# Kill old processes that may be still running
+function (kill_old_processes)
+ set(one_value_options TEST_DIR TEST_EXECUTABLE)
+ cmake_parse_arguments(ARG "${options}" "${one_value_options}"
+ "${multi_value_options}" ${ARGN})
+
+ set(find_processes_command "ps -ef | grep ${ARG_TEST_DIR} | grep -v grep | grep -v ${ARG_TEST_EXECUTABLE}")
+ set(kill_command "${find_processes_command} | awk '{print $2}' | xargs -r kill -9")
+
+ execute_process(COMMAND sh -c "echo \"Killing `${find_processes_command} | wc -l` processes\"; ${find_processes_command}")
+ execute_process(COMMAND sh -c "${kill_command}"
+ COMMAND_ECHO STDOUT)
+endfunction()
+
+foreach(var TEST_DIR TEST_EXECUTABLE RUNCOMMAND RUNCOMMAND_ARGS SLEEP)
+ if(NOT DEFINED ${var})
+ message(FATAL_ERROR "'${var}' must be defined on the command line")
+ endif()
+
+ separate_arguments(var_value UNIX_COMMAND "${${var}}")
+ string(LENGTH "${var_value}" var_length)
+ if (var_length LESS 1)
+ message(FATAL_ERROR "'${var}' must be defined on the command line and not be empty")
+ endif()
+endforeach()
+
+separate_arguments(runparams_list UNIX_COMMAND "${RUNCOMMAND_ARGS}")
+separate_arguments(all_command_params UNIX_COMMAND
+ "${runparams_list} ${TEST_EXECUTABLE} ${TEST_ARGS}")
+kill_old_processes(TEST_DIR ${TEST_DIR}
+ TEST_EXECUTABLE ${TEST_EXECUTABLE})
+
+# Execute the test-executable
+execute_process(COMMAND ${RUNCOMMAND} ${all_command_params}
+ COMMAND_ECHO STDOUT
+ RESULT_VARIABLE result)
+
+# Sleep to ensure all processes are done and kill the remainder
+separate_arguments(sleep_time UNIX_COMMAND "${SLEEP}")
+execute_process(COMMAND ${CMAKE_COMMAND} -E sleep "${sleep_time}"
+ COMMAND ${CMAKE_COMMAND} -E echo "Sleep ${sleep_time}")
+kill_old_processes(TEST_DIR ${TEST_DIR}
+ TEST_EXECUTABLE ${TEST_EXECUTABLE})
+
+# Check return status
+if(result)
+ message(FATAL_ERROR "Test failed:'${result}'")
+endif()
+
+
diff --git a/cmake/version.py.in b/cmake/version.py.in
new file mode 100644
index 00000000..863f92f9
--- /dev/null
+++ b/cmake/version.py.in
@@ -0,0 +1,2 @@
+global tnt_version
+tnt_version = "@PROJECT_VERSION@"
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
new file mode 100644
index 00000000..afe5b076
--- /dev/null
+++ b/docs/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(SPHINX_SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/source)
+set(SPHINX_BUILD ${CMAKE_CURRENT_BINARY_DIR}/)
+
+if (Sphinx_FOUND)
+ add_custom_target(docs ALL
+ COMMAND
+ Sphinx::Sphinx -b html
+ -Drelease=${PROJECT_VERSION}
+ ${SPHINX_SOURCE} ${SPHINX_BUILD}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+ COMMENT "Generating documentation with Sphinx")
+
+ install(DIRECTORY ${SPHINX_BUILD}
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/docs)
+else()
+ message(STATUS "Sphinx not found. Skipping documentation build.")
+endif()
\ No newline at end of file
diff --git a/docs/source/advanced_topics.rst b/docs/source/advanced_topics.rst
new file mode 100644
index 00000000..28caa105
--- /dev/null
+++ b/docs/source/advanced_topics.rst
@@ -0,0 +1,144 @@
+Advanced Topics
+===============
+
+This guide covers a number of advanced topics, such as
+performance, reproducibility and user customization.
+
+
+.. _ranks-label:
+
+GASPI ranks
+^^^^^^^^^^^
+
+In order to execute distributed DNN training, Tarantella starts multiple processes
+on different devices. These processes will be assigned different IDs by the GASPI
+communication library, in order to organize communication and synchronization between
+the different devices. These IDs are called *ranks*. Usually, Tarantella abstracts away
+the concept of *ranks*, in such a way that Tarantella's user interface is essentially
+the same as Keras' user interface.
+
+However, it is sometimes useful to execute a specific part of the code on only one
+rank, or on a subgroup of all ranks. In particular, one sometimes wants to execute a code
+block only on the device that started ``tarantella``, the so-called *master rank*.
+
+To access ranks, Tarantella provides the following functions:
+
+* ``tnt.get_rank()``
+* ``tnt.get_size()``
+* ``tnt.get_master_rank()``
+* ``tnt.is_master_rank()``
+
+``tnt.get_rank()`` returns the ID of the local rank.
+``tnt.get_size()`` returns the total number of ranks.
+``tnt.get_master_rank()`` and ``tnt.is_master_rank()`` return the ID of the master rank
+and a boolean for whether the local rank is the master rank or not, respectively.
+
+Here is a simple example, in which the master rank is used to print notifications
+only once to ``stdout``:
+
+.. code-block:: python
+
+ if tnt.is_master_rank():
+ print("Printing from the master rank")
+
+In the same vein, you might want to use ranks to execute callbacks for logging
+only on one rank:
+
+.. code-block:: python
+
+ history_callback = tf.keras.callbacks.History()
+ tnt_model.fit(train_dataset,
+ callbacks = [history_callback] if tnt.is_master_rank() else [])
+
+
+.. _using-local-batch-sizes-label:
+
+Using local batch sizes
+^^^^^^^^^^^^^^^^^^^^^^^
+
+As stated in the :ref:`points to consider <points-to-consider-label>`, when using
+Tarantella the user always specifies the *global* batch size. This has the advantage that
+the optimization process during DNN training, and in particular the loss function, does not
+depend on the number of devices used during execution.
+
+However, when the number of devices becomes
+very large, the (device-local) micro-batch size might become so small that DNN kernel
+implementations are less efficient, resulting in overall performance degradation.
+This is why it is often advisable in practice to scale the global batch size with the number of nodes.
+This will often lead to linear speedups in terms of the time to accuracy when increasing
+the number of devices used, at least up to some *critical batch size*, cf. [Shallue]_ and [McCandlish]_.
+Changing the batch size of the optimizer will, however, also imply the need to adapt the learning
+rate schedule.
+
+.. todo::
+
+ Enable when the Tutorial is updated:
+  For details, cf. for instance the ResNet-50 tutorial.
+
+If you decide to scale the batch size with the number of nodes, Tarantella provides
+two different ways to achieve this easily. The first option is to multiply the local batch size
+(for instance passed via a command-line parameter) with the number of devices used,
+batch your dataset with it, and call ``fit`` on it:
+
+.. code-block:: python
+
+ micro_batch_size = args.micro_batch_size
+ batch_size = tnt.get_size() * micro_batch_size
+ train_dataset = train_dataset.batch(batch_size)
+ tnt_model.fit(train_dataset)
+
+As a second option, you can pass the local batch size directly to the ``tnt_micro_batch_size``
+parameter in ``fit``, and leave your dataset unbatched:
+
+.. code-block:: python
+
+ micro_batch_size = args.micro_batch_size
+ tnt_model.fit(train_dataset,
+ tnt_micro_batch_size = micro_batch_size)
+
+This parameter is also available in ``evaluate`` and ``predict``. In addition, ``fit`` supports
+setting the validation-set micro-batch size in a similar way with ``tnt_validation_micro_batch_size``.
+For more information, please also read the section on distributed datasets.
+
+
+.. _tensor-fusion-threshold-label:
+
+Setting Tensor Fusion threshold
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Tarantella automatically uses :ref:`Tensor Fusion <tensor-fusion-label>` with a default
+threshold of 32kB. This threshold specifies the minimal size of local buffers in *allreduce*
+communication operations used to accumulate partial gradients during *backpropagation*.
+
+Note that the threshold value implies a trade-off between the potential to utilize network
+bandwidth, and the overlap of computation and communication during *backpropagation*. The
+larger the threshold, the more bandwidth-bound the *allreduce* algorithm will get, but
+the less potential there will be to overlap its execution with kernel computations.
+Also note that the ideal threshold value will generally depend on the number of nodes used.
+
+To change the default value, you can pass a threshold value in kB to ``tarantella``:
+
+.. code-block:: bash
+
+   tarantella --hostfile hostfile --fusion-threshold=<threshold_kB> -- model.py
+
+
+.. _reproducibility-label:
+
+Reproducibility
+^^^^^^^^^^^^^^^
+
+Reproducibility is a very important prerequisite for obtaining meaningful results in
+scientific computing and research. Unfortunately, the use of stochastic algorithms and
+pseudo-random number generators, together with the pitfalls of floating-point arithmetic,
+makes reproducibility particularly difficult to achieve in Deep Learning research.
+
+In order to be able to reproduce results obtained with TensorFlow, when running in
+a multi-node/multi-device setting with Tarantella, one needs to meet at least
+the following requirements:
+
+* set the random seed with ``tf.random.set_seed(seed)``
+* set the environment variable ``os.environ['TF_CUDNN_DETERMINISTIC']='1'``
+* set the shuffle seeds when using ``tf.data.Dataset`` with ``shuffle(seed=seed)`` and ``list_files(seed=seed)``
+* set the ``deterministic`` parameter to ``True`` in ``Dataset`` transformations such as ``interleave`` and ``map``
+* make sure the number of samples in your datasets is a multiple of ``batch_size``
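+
+Putting these requirements together, a minimal sketch could look as follows
+(the seed value and file pattern are placeholders; the ``deterministic``
+parameter requires TF 2.2):
+
+.. code-block:: python
+
+    import os
+    import tensorflow as tf
+
+    seed = 42
+    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
+    tf.random.set_seed(seed)
+
+    dataset = tf.data.Dataset.list_files("data/*.tfrecord", seed=seed)
+    dataset = dataset.interleave(tf.data.TFRecordDataset,
+                                 num_parallel_calls=tf.data.experimental.AUTOTUNE,
+                                 deterministic=True)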
diff --git a/docs/source/bug_reports.rst b/docs/source/bug_reports.rst
new file mode 100644
index 00000000..20f632ec
--- /dev/null
+++ b/docs/source/bug_reports.rst
@@ -0,0 +1,35 @@
+.. _bug-reports-label:
+
+Bug Reports
+===========
+
+To report a bug, please open an `issue on GitHub <https://github.com/cc-hpc-itwm/tarantella/issues>`_.
+
+When opening an issue, please make sure you include as much
+information as possible about the issue. Please consider providing at
+least the following points:
+
+ * What version of Tarantella you are using
+  * What Linux distribution you are using (e.g., Ubuntu 20.04)
+ * What kind of system you are experiencing the issue on (type and
+ number of nodes, network interconnect, etc.)
+ * What did you expect to see and what have you seen instead
+ * What exact steps are needed to reproduce the issue
+
+.. _feature-requests-label:
+
+Feature Requests
+================
+
+For contributions other than modifications to the source code, such as
+suggestions for a feature or enhancement, please open
+an `issue on GitHub <https://github.com/cc-hpc-itwm/tarantella/issues>`_
+with the label ``Feature``.
+
+When providing a feature request, please consider providing at least
+the following information:
+
+ * What is the current behavior of the software and how does the feature improve it
+ * Who would benefit from the feature
+ * Is there a relevant reference or academic paper describing the feature
+ * Are you willing to contribute to and/or maintain the feature
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 00000000..cc2e7e3b
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,72 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'Tarantella'
+copyright = '2020 Fraunhofer'
+author = 'Peter Labus, Alexandra Carpen-Amarie, Martin Kuehn'
+
+# The full version, including alpha/beta/rc tags
+release = '0'
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.todo']
+try:
+ import sphinx_rtd_theme
+ extensions += ['sphinx_rtd_theme']
+except ImportError:
+ pass
+
+# Display TODOs by setting to True
+todo_include_todos = False
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+# html_theme = 'alabaster' # default
+try:
+ import sphinx_rtd_theme
+ html_theme = "sphinx_rtd_theme"
+except ImportError:
+ pass
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_title = release
+html_theme_options = dict()
+html_theme_options['logo_only'] = False
+# html_theme_options['display_version']= True
+# html_logo = "pics/tnt_logo.png"
diff --git a/docs/source/contact.rst b/docs/source/contact.rst
new file mode 100644
index 00000000..472374e2
--- /dev/null
+++ b/docs/source/contact.rst
@@ -0,0 +1,14 @@
+.. _contact-label:
+
+Contact
+=======
+
+In case you have a feature request
+or want to report a bug, please follow
+:ref:`these instructions <bug-reports-label>`.
+
+If you consider contributing to Tarantella, please follow
+the instructions :ref:`here <contributing-label>`.
+
+If you have any further questions or comments, please email us at
+support@tarantella.org.
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
new file mode 100644
index 00000000..08cdd8ad
--- /dev/null
+++ b/docs/source/contributing.rst
@@ -0,0 +1,20 @@
+.. _contributing-label:
+
+Contributing
+============
+
+Thank you for considering contributing to Tarantella.
+
+There are many ways to contribute to Tarantella.
+This includes sharing DNN models distributed through Tarantella,
+providing suggestions on improving the documentation,
+as well as contributing changes to the
+`Tarantella code base <https://github.com/cc-hpc-itwm/tarantella>`_.
+Even simply providing suggestions on how we can
+:ref:`improve Tarantella <feature-requests-label>`
+and helping spread the word about it are great ways to contribute
+and make Tarantella better software.
+
+If you want to contribute to Tarantella with changes to its code,
+please open a `pull request <https://github.com/cc-hpc-itwm/tarantella/pulls>`_
+on GitHub.
diff --git a/docs/source/data_parallel.rst b/docs/source/data_parallel.rst
new file mode 100644
index 00000000..0a914d47
--- /dev/null
+++ b/docs/source/data_parallel.rst
@@ -0,0 +1,195 @@
+Distributed Data Parallel Training
+==================================
+
+The following section explains the parallelization strategy Tarantella uses to
+provide distributed training. A full understanding thereof is, however, not required
+to be able to use the software. Please note the :ref:`points to consider <points-to-consider-label>`
+to achieve best performance and reproducibility.
+
+The general idea
+----------------
+
+In order to parallelize the training of DNNs, different, complementary strategies are available.
+The conceptually simplest and most efficient one is called *data parallelism*. This strategy
+is already in use when deploying batched optimizers, such as stochastic gradient descent (SGD)
+or ADAM. In this case, input samples are grouped together in so-called mini-batches and
+are processed in parallel.
+
+Distribution of mini-batches
+----------------------------
+
+Tarantella extends this scheme by splitting each mini-batch into a number of micro-batches,
+which are then executed on different devices (e.g., GPUs).
+In order to do this, the DNN is replicated on each device,
+which then processes part of the data independently of the other devices.
+During the *backpropagation* pass, partial results need to be accumulated via a so-called
+*allreduce* collective operation.
+
+Overlapping communication with computation
+------------------------------------------
+
+Tarantella implements this communication scheme using the
+`Global Address Space Programming Interface (GASPI) <https://www.gaspi.de/>`_.
+This allows in particular to overlap the communication needed to execute *allreduce* operations
+with the computation done in the *backpropagation* part of the DNN training.
+This is done by starting *allreduce* operations as soon as the required local incoming gradients are
+available, while continuing with *backpropagation* calculations at the same time.
+The final, accumulated gradients are only expected once the entire *backpropagation* is completed.
+This drastically mitigates the communication overhead introduced by the need to synchronize
+the different devices, and leads to higher scalability.
+
+.. _tensor-fusion-label:
+
+Tensor Fusion
+-------------
+
+The granularity at which Tarantella executes *allreduce* operations can be varied from
+one *allreduce* per layer (finest granularity) to one *allreduce* per iteration (coarsest granularity).
+Using coarser granularities, i.e., *fusing* gradient tensors,
+can lead to better bandwidth utilization, thus potentially increasing performance.
+*Tensor Fusion* is set up before the first iteration of training and incurs no additional communication overhead.
+Tarantella enables *Tensor Fusion* by default, but its granularity can be adjusted by the user,
+cf. :ref:`here <tensor-fusion-threshold-label>`.
+
+Model initialization and loading
+--------------------------------
+
+In order to guarantee that all devices have the same copy of the DNN when training is initiated,
+the model needs to be communicated from one device to all the others.
+This is done in Tarantella via the use of a so-called
+*broadcast* operation.
+This scheme applies both when the weights of a DNN are initialized randomly,
+or loaded from a checkpoint.
+As Tarantella provides this functionality automatically,
+the user does not have to take care of it.
+
+.. _points-to-consider-label:
+
+Distributed Datasets
+=====================
+
+In order to process micro-batches independently on each device and to obtain the same results
+as in serial execution, the input data of each mini-batch has to be split and distributed
+among all devices.
+
+Tarantella automatically takes care of this through the use of distributed datasets.
+The user simply provides Tarantella with a ``tf.data.Dataset`` that is batched
+with the mini-batch size. Tarantella will then automatically distribute the input data
+by sharding the mini-batch into individual micro-batches. Sharding is done at the level
+of samples (as opposed to, e.g., files) to ensure :ref:`reproducibility <reproducibility-label>`
+of serial results.
+
+To guarantee reproducibility, it is also important that shuffling of samples is done
+in the same way on all devices. Tarantella does this using either the ``seed`` provided
+by the user, or a specific default seed. Please refer to the
+:ref:`Quick Start <quick-start-label>`
+for more details.
+
+Points to Consider
+==================
+
+.. _global-vs-local-batch-size-label:
+
+Global versus local batch size
+------------------------------
+
+As explained above, when using data parallelism, there exists a *mini-batch size*
+(in the following also called global batch size or simply batch size)
+as well as a *micro-batch size* (also called local batch size).
+The former represents the number of samples that
+is averaged over in the loss function of the optimizer, and is equivalent to
+the (mini-)batch size used in non-distributed training. The latter is the number
+of samples that is processed locally by each of the devices per iteration.
+
+.. note::
+
+ In Tarantella, the user always specifies the **global batch size**.
+
+Using a strictly synchronous optimization scheme, and by carefully handling the data distribution,
+**Tarantella guarantees the reproducibility of DNN training results independently of the number of
+devices used**, as long as all hyperparameters (such as global batch size and learning rate)
+are kept constant. [#footnote_random_seeds]_
+
+However, to achieve best performance for certain DNN operators (``Conv2D``, ``Dense``, etc.)
+it is often advisable to *keep the local batch size constant*, and scale the global
+batch size with the number of devices used. This, in turn, will force you to
+adjust other hyperparameters, such as the learning rate, in order to converge
+to a comparable test accuracy, as observed for instance in [Shallue]_.
+
+In practice, the use of a learning rate schedule with an initial *warm-up* phase and
+*linear learning rate scaling* [Goyal]_ often suffices, as sketched below.
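+
+A minimal sketch of such a schedule with Keras is shown below; the base learning rate,
+number of devices, and warm-up length are illustrative and need to be tuned for your model:
+
+.. code-block:: python
+
+   import tensorflow as tf
+
+   base_lr = 0.1      # learning rate tuned for single-device training
+   num_devices = 8    # factor by which the global batch size was scaled
+   warmup_epochs = 5
+
+   def lr_schedule(epoch, lr):
+       scaled_lr = base_lr * num_devices   # linear scaling rule [Goyal]
+       if epoch < warmup_epochs:           # linear warm-up towards the scaled rate
+           return base_lr + (scaled_lr - base_lr) * epoch / warmup_epochs
+       return scaled_lr
+
+   lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule)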
+
+.. tip::
+
+   For best performance, scale the batch size with the number of devices used,
+   and adapt the learning rate schedule as described above.
+
+Batch normalization layers
+--------------------------
+
+The issue of global versus local batch size particularly affects the layers
+that calculate (and learn) statistics over entire batches.
+A well-known example of this type of layer is
+*batch normalization*.
+
+.. caution::
+
+ Tarantella always calculates batch statistics over **local batches**.
+
+As a consequence, the training results for DNNs with batch-normalization layers
+**will not be identical when changing the number of devices, even if
+the global batch size stays the same.**
+At the moment, this can be circumvented by using normalization layers that
+do *not* average over entire batches, such as instance normalization
+[Ulyanov]_.
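+
+As a minimal sketch, assuming the
+`TensorFlow Addons <https://www.tensorflow.org/addons>`_ package is available,
+a ``BatchNormalization`` layer could be replaced like this:
+
+.. code-block:: python
+
+   import tensorflow_addons as tfa
+   from tensorflow import keras
+
+   inputs = keras.Input(shape=(28, 28, 1))
+   x = keras.layers.Conv2D(64, 3, padding="same", activation="relu")(inputs)
+   # normalizes each sample individually, independently of the (micro-)batch size
+   x = tfa.layers.InstanceNormalization()(x)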
+
+Averaging over *local* batches instead of global batches should in practice
+have only minor influence on the quality of the final test accuracy.
+Beware, however, of the extreme case of very small *local* batch sizes.
+
+.. caution::
+
+ Avoid using ``BatchNormalization`` layers when the global batch size
+ divided by the number of devices used is *smaller than 16*.
+
+In such cases, the local batches that are used to collect statistics are
+too small to obtain meaningful results. This will likely reduce the
+benefits of batch normalization, cf. for instance [Yang]_ and [Uppal]_.
+In this case, please consider increasing the global batch size,
+or reducing the number of devices used.
+
+Managing individual devices
+---------------------------
+
+Although Tarantella's user interface abstracts away most of the details of
+parallel programming, it is sometimes useful to be able to control
+Python code execution at device level. This can be achieved using the
+`GASPI <https://www.gaspi.de/>`_ concept
+of a ``rank``. Details on how to do this can be found in the
+advanced topics section.
+
+.. rubric:: References
+
+.. [Shallue] Shallue, Christopher J., et al. "Measuring the effects of data parallelism on neural network training." arXiv preprint arXiv:1811.03600 (2018).
+
+.. [Ulyanov] Ulyanov, Dmitry, Andrea Vedaldi, and Victor Lempitsky. "Instance normalization: The missing ingredient for fast stylization." arXiv preprint arXiv:1607.08022 (2016).
+
+.. [Goyal] Goyal, Priya, et al. "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour." arXiv preprint arXiv:1706.02677 (2017).
+
+.. [Yang] Yang, Greg, et al. "A mean field theory of batch normalization." arXiv preprint arXiv:1902.08129 (2019).
+
+.. [Uppal] https://towardsdatascience.com/curse-of-batch-normalization-8e6dd20bc304
+
+.. [McCandlish] McCandlish, Sam, et al. "An empirical model of large-batch training." arXiv preprint arXiv:1812.06162 (2018).
+
+.. [He] He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
+
+.. [Vaswani] Vaswani, Ashish, et al. "Attention is all you need." Advances in neural information processing systems. 2017.
+
+.. rubric:: Footnotes
+
+.. [#footnote_random_seeds] This is strictly true only when all randomness in TensorFlow is
+   seeded or switched off, as explained in the advanced topics.
+
diff --git a/docs/source/faq.rst b/docs/source/faq.rst
new file mode 100644
index 00000000..b2ba811c
--- /dev/null
+++ b/docs/source/faq.rst
@@ -0,0 +1,80 @@
+.. _faq-label:
+
+Frequently Asked Questions (FAQ)
+================================
+
+This is a list of frequently asked questions about Tarantella.
+Please feel free to suggest new ones!
+
+.. admonition:: Question
+
+ How can I ssh to ``localhost`` without password?
+
+In order to run Tarantella programs, you will need to be able to ssh to ``localhost``
+without a password. To do that, generate ``ssh`` keys first:
+
+.. code-block:: bash
+
+ cd ~/.ssh
+ ssh-keygen
+
+Make sure not to overwrite existing keys.
+When asked for a passphrase (``Enter passphrase (empty for no passphrase):``), simply leave it
+empty and confirm with enter.
+Also take care to set the correct permissions on all files in ``.ssh``,
+as shown below.
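+
+Assuming the default key file names, the permissions should typically look like this:
+
+.. code-block:: bash
+
+   chmod 700 ~/.ssh
+   chmod 600 ~/.ssh/id_rsa
+   chmod 644 ~/.ssh/id_rsa.pub
+   chmod 600 ~/.ssh/authorized_keys
+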
+Next, append the public key to the ``authorized_keys`` file:
+
+.. code-block:: bash
+
+ cat id_rsa.pub >> authorized_keys
+
+Now, install and start an ssh server, e.g., ``openssh-server`` on Fedora.
+
+.. admonition:: Question
+
+ I get an execution error ``GPI library initialization incorrect environment vars`` when
+ trying to run my script. What shall I do?
+
+Most likely you are running your program with ``python my_script.py`` or ``./my_script.py``.
+Please make sure to execute your code with ``tarantella my_script.py`` instead.
+
+.. admonition:: Question
+
+ I get an execution error ``GPI library initialization general error``. What shall I do?
+
+This error occurs when the GASPI library tries to connect to a previously used socket that has
+not yet been released.
+Try to re-run your code after a short while, once the port becomes available again.
+
+.. admonition:: Question
+
+ The execution seems to stall. What shall I do?
+
+Please kill any processes that might still be running from a previous (aborted) call to ``tarantella``.
+
+.. admonition:: Question
+
+ | When trying to build Tarantella, CMake cannot find pybind11:
+ | ``Could not find a package configuration file provided by "pybind11" with any``
+ | ``of the following names: [...]``
+ | What shall I do?
+
+This error occurs when pybind11 is installed using pip.
+Please instead use conda, as recommended in the :ref:`installation guide <installation-label>`.
+
+.. admonition:: Question
+
+ When trying to build Tarantella, CMake does not detect the Python interpreter from the
+ active conda environment. What shall I do?
+
+You will need to manually add the path to the conda environment's ``bin`` directory to your ``PATH``.
+You will also need to specify the path to the python library on the command line when configuring Tarantella:
+
+.. code-block:: bash
+
+ PATH_TO_CONDA_ENV=/path/to/conda/env
+ export PATH=${PATH_TO_CONDA_ENV}/bin:${PATH}
+ cmake -DPYTHON_EXECUTABLE=${PATH_TO_CONDA_ENV}/bin/python \
+ -DPYTHON_LIBRARY=${PATH_TO_CONDA_ENV}/lib ../
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..8a3a7e37
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,46 @@
+.. image:: pics/tnt_logo_text.png
+ :width: 750
+ :align: center
+
+|
+`Tarantella <https://github.com/cc-hpc-itwm/tarantella>`_
+is an open-source, distributed Deep Learning framework built on top of TensorFlow 2,
+providing scalable Deep Neural Network training on CPU and GPU compute clusters.
+
+Tarantella is easy to use, allows re-using existing TensorFlow 2/Keras models,
+and does not require any knowledge of parallel computing.
+
+.. image:: pics/tnt_run.gif
+ :width: 750
+ :align: center
+
+|
+
+Table of contents
+=================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Overview
+
+ why_tarantella
+ data_parallel
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Getting started
+
+ installation
+ quick_start
+ tutorials
+ advanced_topics
+ faq
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Community
+
+ bug_reports
+ contributing
+ contact
+ license
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
new file mode 100644
index 00000000..6bb6f809
--- /dev/null
+++ b/docs/source/installation.rst
@@ -0,0 +1,197 @@
+.. _installation-label:
+
+Installation
+============
+
+Tarantella needs to be built
+`from source <https://github.com/cc-hpc-itwm/tarantella>`_.
+Since Tarantella is built on top of `TensorFlow 2 <https://www.tensorflow.org/>`_,
+you will require a recent version of it. Additionally, you will need an installation of
+the open-source communication library `GPI-2 <https://github.com/cc-hpc-itwm/GPI-2>`_,
+which Tarantella uses to communicate between processes.
+Lastly, you will need `pybind11 <https://github.com/pybind/pybind11>`_, which is required
+for interoperability between Python and C++.
+
+In the following we will look at the required steps in detail.
+
+Installing dependencies
+-----------------------
+
+Compiler and build system
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Tarantella can be built using a recent `gcc <https://gcc.gnu.org/>`_
+compiler (from version ``7.4.0``).
+You will also need the build tool `CMake <https://cmake.org/>`_ (from version ``3.8``).
+
+Installing GPI-2
+^^^^^^^^^^^^^^^^
+
+Next, you will need to download, compile and install the GPI-2 library.
+The currently supported version is ``v1.4.0``, which needs to be built with
+position independent flags (``-fPIC``).
+
+To download the required version, clone the
+`git repository <https://github.com/cc-hpc-itwm/GPI-2>`_
+and check out the correct ``tag``:
+
+.. code-block:: bash
+
+ git clone https://github.com/cc-hpc-itwm/GPI-2.git
+ cd GPI-2
+ git fetch --tags
+ git checkout -b v1.4.0 v1.4.0
+
+Now, use autotools to configure and compile the code
+
+.. code-block:: bash
+
+ ./autogen.sh
+ export GPI2_INSTALLATION_PATH=/your/installation/path
+ CFLAGS="-fPIC" CPPFLAGS="-fPIC" ./configure --with-ethernet --prefix=${GPI2_INSTALLATION_PATH}
+ make
+
+where ``${GPI2_INSTALLATION_PATH}`` needs to be replaced with the path where you want to install
+GPI-2. Note the ``--with-ethernet`` option, which will use standard TCP sockets for communication.
+This is the correct option for laptops and workstations.
+
+In case you want to use Infiniband, replace the above option with ``--with-infiniband``.
+Now you are ready to install GPI-2 with
+
+.. code-block:: bash
+
+ make install
+ export PATH=${GPI2_INSTALLATION_PATH}/bin:$PATH
+ export LD_LIBRARY_PATH=${GPI2_INSTALLATION_PATH}/lib64:$LD_LIBRARY_PATH
+
+where the last two commands make the library visible to your system.
+If required, GPI-2 can be removed from the target directory by using ``make uninstall``.
+
+Installing TensorFlow 2
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Next, you will need to install TensorFlow 2.
+Tarantella supports TensorFlow versions ``2.0`` to ``2.2``.
+Any of these versions can be installed in a conda environment using pip,
+as recommended on the `TensorFlow website <https://www.tensorflow.org/install>`_.
+
+In order to do that, first install `conda <https://docs.conda.io/>`_ on your system.
+Then, create and activate an environment for Tarantella:
+
+.. code-block:: bash
+
+   conda create -n tarantella
+   conda activate tarantella
+
+Now, you can install the latest supported TensorFlow version with
+
+.. code-block:: bash
+
+ conda install python=3.7
+ pip install --upgrade tensorflow==2.2
+
+.. _installation-pybind11-label:
+
+Installing pybind11
+^^^^^^^^^^^^^^^^^^^
+
+The last dependency you will need to install is
+`pybind11 <https://github.com/pybind/pybind11>`__,
+which is available through pip and conda.
+We recommend installing pybind11 via conda:
+
+.. code-block:: bash
+
+ conda install pybind11 -c conda-forge
+
+SSH key-based authentication
+----------------------------
+
+In order to use Tarantella on a cluster, make sure you can ssh between nodes
+without a password. For details, refer to the :ref:`FAQ section <faq-label>`.
+In particular, to test Tarantella on your local machine, make sure
+you can ssh to ``localhost`` without a password.
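+
+A quick check is to run a command on ``localhost`` via ssh; it should complete
+without prompting for a password:
+
+.. code-block:: bash
+
+   ssh localhost hostname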
+
+Building Tarantella from source
+-------------------------------
+
+With all dependencies installed, we can now download, configure and compile Tarantella.
+To download the source code, simply clone the
+`GitHub repository <https://github.com/cc-hpc-itwm/tarantella>`__:
+
+.. code-block:: bash
+
+ git clone https://github.com/cc-hpc-itwm/tarantella.git
+
+Next, we need to configure the build system using CMake.
+For a standard out-of-source build, we create a separate ``build`` folder and run ``cmake``
+in it:
+
+.. code-block:: bash
+
+ cd tarantella
+ mkdir build && cd build
+ export TARANTELLA_INSTALLATION_PATH=/your/installation/path
+ cmake -DCMAKE_INSTALL_PREFIX=${TARANTELLA_INSTALLATION_PATH} ..
+
+Now, we can compile and install Tarantella to ``TARANTELLA_INSTALLATION_PATH``:
+
+.. code-block:: bash
+
+ make
+ make install
+ export PATH=${TARANTELLA_INSTALLATION_PATH}/bin:${PATH}
+
+[Optional] Building and running tests
+-------------------------------------
+
+In order to build Tarantella with tests, you will also need to install
+`Boost <https://www.boost.org/>`_
+(for C++ tests) and `pytest <https://docs.pytest.org/>`_ (for Python tests).
+
+To install Boost with the required ``devel`` packages, on Ubuntu you can use
+
+.. code-block:: bash
+
+ sudo apt install libboost-all-dev
+
+while on Fedora you can use
+
+.. code-block:: bash
+
+ sudo dnf install boost boost-devel
+
+To install pytest you can use pip:
+
+.. code-block:: bash
+
+ pip install -U pytest
+
+After having installed these libraries, make sure to configure Tarantella with testing switched on:
+
+.. code-block:: bash
+
+ cmake -DENABLE_TESTING=ON ..
+
+Now you can compile Tarantella and run its tests in the ``build`` directory.
+
+.. code-block:: bash
+
+ make
+ ctest
+
+[Optional] Building documentation
+---------------------------------
+
+If you would like to build the documentation locally,
+run the following ``cmake`` command
+
+.. code-block:: bash
+
+ cmake -DCMAKE_INSTALL_PREFIX=${TARANTELLA_INSTALLATION_PATH} -DBUILD_DOCS=ON ..
+
+before compiling.
+This requires you to have `Sphinx <https://www.sphinx-doc.org/>`_ installed:
+
+.. code-block:: bash
+
+ pip install -U sphinx
diff --git a/docs/source/license.rst b/docs/source/license.rst
new file mode 100644
index 00000000..b46bf268
--- /dev/null
+++ b/docs/source/license.rst
@@ -0,0 +1,5 @@
+License
+=======
+
+.. literalinclude:: ../../LICENSE
+ :language: text
diff --git a/docs/source/model.py b/docs/source/model.py
new file mode 100644
index 00000000..2845f141
--- /dev/null
+++ b/docs/source/model.py
@@ -0,0 +1,89 @@
+import argparse
+import tensorflow as tf
+from tensorflow import keras
+
+import tarantella as tnt
+tnt.init()
+
+def parse_args():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-bs", "--batch_size", type=int, default=64)
+ parser.add_argument("-e", "--number_epochs", type=int, default=1)
+ parser.add_argument("-lr", "--learning_rate", type=float, default=0.01)
+ parser.add_argument("-train", "--train_size", type=int, default=48000)
+ parser.add_argument("-val", "--val_size", type=int, default=6400)
+ parser.add_argument("-test", "--test_size", type=int, default=6400)
+ args = parser.parse_args()
+ return args
+
+def mnist_as_np_arrays(training_samples, validation_samples, test_samples):
+ mnist_train_size = 60000
+ mnist_test_size = 10000
+ assert(training_samples + validation_samples <= mnist_train_size)
+ assert(test_samples <= mnist_test_size)
+
+ # load given number of samples
+ (x_train_all, y_train_all), (x_test_all, y_test_all) = \
+ keras.datasets.mnist.load_data()
+ x_train = x_train_all[:training_samples]
+ y_train = y_train_all[:training_samples]
+ x_val = x_train_all[training_samples:training_samples+validation_samples]
+ y_val = y_train_all[training_samples:training_samples+validation_samples]
+ x_test = x_test_all[:test_samples]
+ y_test = y_test_all[:test_samples]
+
+ # normalization and reshape
+ x_train = x_train.reshape(training_samples,28,28,1).astype('float32') / 255.
+ x_val = x_val.reshape(validation_samples,28,28,1).astype('float32') / 255.
+ x_test = x_test.reshape(test_samples,28,28,1).astype('float32') / 255.
+ y_train = y_train.astype('float32')
+ y_val = y_val.astype('float32')
+ y_test = y_test.astype('float32')
+
+ return (x_train, y_train), (x_val, y_val), (x_test, y_test)
+
+def lenet5_model_generator():
+ inputs = keras.Input(shape=(28,28,1,), name='input')
+ x = keras.layers.Conv2D(20, 5, padding="same", activation='relu')(inputs)
+ x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
+ x = keras.layers.Conv2D(50, 5, padding="same", activation='relu')(x)
+ x = keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2))(x)
+ x = keras.layers.Flatten()(x)
+ x = keras.layers.Dense(500, activation='relu')(x)
+ outputs = keras.layers.Dense(10, activation='softmax')(x)
+ return keras.Model(inputs=inputs, outputs=outputs)
+
+args = parse_args()
+
+# Create Tarantella model
+model = tnt.Model(lenet5_model_generator())
+
+# Compile Tarantella model (as with Keras)
+model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate),
+ loss = keras.losses.SparseCategoricalCrossentropy(),
+ metrics = [keras.metrics.SparseCategoricalAccuracy()])
+
+# Load MNIST dataset (as with Keras)
+shuffle_seed = 42
+(x_train, y_train), (x_val, y_val), (x_test, y_test) = \
+ mnist_as_np_arrays(args.train_size, args.val_size, args.test_size)
+
+train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed)
+train_dataset = train_dataset.batch(args.batch_size)
+train_dataset = train_dataset.prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
+
+val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))
+val_dataset = val_dataset.batch(args.batch_size)
+
+test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+test_dataset = test_dataset.batch(args.batch_size)
+
+# Train Tarantella model (as with Keras)
+model.fit(train_dataset,
+ validation_data = val_dataset,
+ epochs = args.number_epochs,
+ verbose = 1)
+
+# Evaluate Tarantella model (as with Keras)
+model.evaluate(test_dataset, verbose = 1)
diff --git a/docs/source/pics/tnt_logo.png b/docs/source/pics/tnt_logo.png
new file mode 100644
index 00000000..87f475be
Binary files /dev/null and b/docs/source/pics/tnt_logo.png differ
diff --git a/docs/source/pics/tnt_logo_text.png b/docs/source/pics/tnt_logo_text.png
new file mode 100644
index 00000000..4a829d20
Binary files /dev/null and b/docs/source/pics/tnt_logo_text.png differ
diff --git a/docs/source/pics/tnt_run.gif b/docs/source/pics/tnt_run.gif
new file mode 100644
index 00000000..cc39935a
Binary files /dev/null and b/docs/source/pics/tnt_run.gif differ
diff --git a/docs/source/quick_start.rst b/docs/source/quick_start.rst
new file mode 100644
index 00000000..36225ff7
--- /dev/null
+++ b/docs/source/quick_start.rst
@@ -0,0 +1,455 @@
+.. _quick-start-label:
+
+Quick Start
+===========
+
+This section explains how to get started using Tarantella to distributedly
+train an existing TensorFlow 2/Keras model.
+First, we will examine what changes have to be made to your code, before looking into
+the execution of your script with ``tarantella`` on the command line.
+Finally, we will present the features Tarantella currently supports and
+the important points that need to be taken into account when using Tarantella.
+
+Code example: LeNet-5 on MNIST
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+After having :ref:`built and installed <installation-label>` Tarantella,
+we are ready to add distributed training support to an existing TensorFlow 2/Keras model.
+We will first illustrate all the necessary steps, using the well-known example of
+**LeNet-5** on the **MNIST** dataset. Although this is not necessarily a good use case
+to take full advantage of Tarantella's capabilities, it will allow you to simply
+copy-paste the code snippets and try them out, even on your laptop.
+
+**Let's get started!**
+
+.. literalinclude:: quick_start_model.py
+ :language: Python
+ :linenos:
+ :emphasize-lines: 3,9,13
+
+As you can see from the marked lines in the code snippet,
+you only need to add *3 lines of code* to train LeNet-5 distributedly using Tarantella!
+Let us go through the code in some more detail, in order to understand what is going on.
+
+First we need to import the Tarantella library:
+
+.. code-block:: Python
+
+ import tarantella as tnt
+
+Having done that we need to initialize the library (which will setup the communication infrastructure):
+
+.. code-block:: Python
+
+ tnt.init()
+
+Note that this should be done before executing any other code. Next, we need to wrap the
+``keras.Model`` object, generated by ``lenet5_model_generator()``, into a ``tnt.Model`` object:
+
+.. code-block:: Python
+
+ model = tnt.Model(lenet5_model_generator())
+
+**That's it!**
+
+All the necessary steps to distribute training and datasets will now automatically be handled by Tarantella.
+In particular, we still run ``model.compile`` on the new ``model`` to generate a compute graph,
+just as we would have done with a typical Keras model.
+
+Next, we load the MNIST data for training and testing, and
+create ``Dataset`` s from it. Note that we ``batch`` the dataset for training.
+This will guarantee that Tarantella is able to distribute the data later on in the correct way.
+Also note that the ``batch_size`` used here is the same as for the original model,
+that is, the *global* batch size. For details concerning local and global batch sizes,
+have a look :ref:`here <global-vs-local-batch-size-label>`.
+
+Now we are able to train our ``model`` using ``model.fit``, in the same familiar
+way used by the standard Keras interface. Note, however, that Tarantella is taking care of proper
+distribution of the ``train_dataset`` in the background. All the possibilities of how to
+feed datasets to Tarantella are explained in more detail below.
+Lastly, we can evaluate the final accuracy of our ``model`` on the ``test_dataset`` using
+``model.evaluate``.
+
+To test and run ``tarantella`` in the next section, you can find a full version of the above example
+in ``docs/source/model.py``.
+
+Executing your model with ``tarantella``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Next, let's execute our model distributedly using ``tarantella`` on the command line.
+The simplest way to do that is by passing the Python script of the model to ``tarantella``:
+
+.. code-block:: bash
+
+ tarantella -- model.py
+
+This will execute our model distributedly on a single node, using all the available GPUs.
+In case no GPUs can be found, ``tarantella`` will execute in serial mode on the CPU,
+and a ``WARNING`` message will be issued. In case you have GPUs available, but
+want to execute ``tarantella`` on CPUs nonetheless, you can specify the ``--no-gpu`` option:
+
+.. code-block:: bash
+
+ tarantella --no-gpu -- model.py
+
+We can also set command line parameters for the Python script ``model.py``; these have to
+follow the name of the script:
+
+.. code-block:: bash
+
+ tarantella --no-gpu -- model.py --batch_size=64 --learning_rate=0.01
+
+On a single node, we can also explicitly specify the number of TensorFlow instances
+we want to use. This is done with the ``-n`` option:
+
+.. code-block:: bash
+
+ tarantella -n 4 -- model.py --batch_size=64
+
+Here, ``tarantella`` would try to execute distributedly on 4 GPUs.
+If there are not enough GPUs available, ``tarantella`` will print a ``WARNING``
+and run 4 instances of TensorFlow on the CPU instead.
+If there are no GPUs installed or the ``--no-gpu`` option is used,
+``tarantella`` will not print a ``WARNING``.
+
+Next, let's run ``tarantella`` on multiple nodes. In order to do this,
+we need to provide ``tarantella`` with a ``hostfile`` that contains
+the ``hostname`` s of the nodes that we want to use:
+
+.. code-block:: bash
+
+ $ cat hostfile
+ name_of_node_1
+ name_of_node_2
+
+With this ``hostfile`` we can run ``tarantella`` on multiple nodes:
+
+.. code-block:: bash
+
+ tarantella --hostfile hostfile -- model.py
+
+In this case, ``tarantella`` uses *all* GPUs it can find.
+If no GPUs are available, ``tarantella`` will start *one* TensorFlow instance
+per node on the CPUs, and will issue a ``WARNING`` message.
+Again, this can be disabled by explicitly using the ``--no-gpu``
+option.
+
+As before, you can specify the number of GPUs/CPUs used per node
+explicitly with the option ``--n-per-node=``:
+
+.. code-block:: bash
+
+ tarantella --hostfile hostfile --n-per-node=4 --no-gpu -- model.py --batch_size=64
+
+In this example, ``tarantella`` would execute 4 instances of TensorFlow on the CPUs
+of each node specified in ``hostfile``.
+
+.. caution::
+
+   ``tarantella`` requires all the names in the ``hostfile`` to be **unique**,
+   and all nodes to be **homogeneous** (in the number and type of CPUs and GPUs).
+
+In addition, ``tarantella`` can be run with different levels of logging output.
+The available log levels are ``DEBUG``, ``INFO``, ``WARNING`` and ``ERROR``,
+and can be set with ``--log-level``:
+
+.. code-block:: bash
+
+ tarantella --hostfile hostfile --log-level=INFO -- model.py
+
+By default, ``tarantella`` will log on the *master rank* only.
+This can be changed by using the ``--log-on-all-devices`` option, which will print
+log messages for each *rank* individually.
+
+Similarly, by default ``tarantella`` will print outputs from functions like ``fit``,
+``evaluate`` and ``predict``, as well as callbacks only on the master rank.
+Sometimes, it might be useful to print outputs from all devices (e.g., for debugging),
+which can be switched on with the ``--output-on-all-devices`` option.
+
+``tarantella`` uses GPI-2's ``gaspi_run`` internally, taking care of ``export`` ing
+environment variables, and generating an execution script from the user inputs.
+Details of this process can be monitored using the ``--dry-run`` option.
+
+Lastly, you can overwrite the *Tensor Fusion* threshold ``tarantella`` uses
+with ``--fusion-threshold FUSION_THRESHOLD_KB``
+(cf. the :ref:`Tensor Fusion <tensor-fusion-label>` section),
+and set a number of environment variables, most notably
+``TNT_TENSORBOARD_ON_ALL_DEVICES``, as explained in the
+:ref:`callbacks section <callbacks-label>`.
+
+Save and load Tarantella models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Storing and loading your trained ``tnt.Model`` is very simple.
+
+Tarantella supports all the different ways in which you can load and store a ``keras.Model``
+(for a guide, see for instance the
+`TensorFlow documentation <https://www.tensorflow.org/guide/keras/save_and_serialize>`__).
+In particular, you can:
+
+* save the whole model (including the architecture, the weights and the state of the optimizer)
+* save the model's architecture/configuration only
+* save the model's weights only
+
+Whole-model saving and loading
+------------------------------
+
+Saving the entire model including the architecture, weights and optimizer can be done via
+
+.. code-block:: python
+
+ model = ... # get `tnt.Model`
+ model.save('path/to/location')
+
+Alternatively, you could use ``tnt.models.save_model(model, 'path/to/location')``, which works
+on both ``keras.Model`` s and ``tnt.Model`` s.
+
+You can then load your model back using
+
+.. code-block:: python
+
+ import tarantella as tnt
+ model = tnt.models.load_model('path/to/location')
+
+which will return an instance of ``tnt.Model``.
+
+.. caution::
+
+ At the moment, you will need to re-compile your model after loading.
+
+This is again done with
+
+.. code-block:: python
+
+ model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate),
+ loss = keras.losses.SparseCategoricalCrossentropy(),
+ metrics = [keras.metrics.SparseCategoricalAccuracy()])
+
+or similar.
+
+Architecture saving and loading
+-------------------------------
+
+If you only want to save the configuration (that is the architecture) of your model
+(in memory), you can use one of the following functions:
+
+* ``tnt.Model.get_config``
+* ``tnt.Model.to_json``
+* ``tnt.Model.to_yaml``
+
+The architecture without its original weights and optimizer can then be restored
+using:
+
+* ``tnt.models.model_from_config`` / ``tnt.Model.from_config``
+* ``tnt.models.model_from_json``
+* ``tnt.models.model_from_yaml``
+
+respectively.
+Here is an example:
+
+.. code-block:: python
+
+ import tarantella as tnt
+ model = ... # get `tnt.Model`
+ config = model.get_config()
+ new_model = tnt.models.model_from_config(config)
+
+The same can be achieved through cloning:
+
+.. code-block:: python
+
+ import tarantella as tnt
+ model = ... # get `tnt.Model`
+ new_model = tnt.models.clone_model(model)
+
+
+Weights saving and loading
+--------------------------
+
+Storing and loading the weights of a model to/from memory can be done
+using the functions ``tnt.Model.get_weights`` and ``tnt.Model.set_weights``,
+respectively. Saving and loading weights to/from disk is done
+using the functions ``tnt.Model.save_weights`` and ``tnt.Model.load_weights``,
+respectively.
+
+Here is an example how this can be used to restore a model:
+
+.. code-block:: python
+
+ import tarantella as tnt
+ model = ... # get `tnt.Model`
+ config = model.get_config()
+ weights = model.get_weights()
+
+ # initialize a new model with original model's weights
+ new_model = tnt.models.model_from_config(config)
+ new_model.set_weights(weights)
+
+.. _checkpointing-via-callbacks-label:
+
+Checkpointing via callbacks
+---------------------------
+
+Apart from saving and loading models manually, Tarantella also supports checkpointing
+via Keras' ``ModelCheckpoint`` callback, as described for instance in the
+`Keras documentation <https://keras.io/api/callbacks/model_checkpoint/>`__.
+
+.. code-block:: python
+
+ import tensorflow as tf
+ import tarantella as tnt
+
+ model = ... # get `tnt.Model`
+
+ checkpoint_path = 'path/to/checkpoint/location'
+ model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
+ filepath=checkpoint_path, monitor='val_acc', verbose=1, save_best_only=False,
+ save_weights_only=False, mode='auto', save_freq='epoch', options=None)
+
+ model.fit(train_dataset,
+ validation_data = val_dataset,
+ epochs = 2,
+ callbacks = [model_checkpoint_callback])
+
+
+.. note::
+
+ All saving to the filesystem (including ``tnt.Model.save`` and ``tnt.Model.save_weights``)
+ by Tarantella will only be done on the master rank.
+
+This is the default and will yield correct behavior when you are using a distributed filesystem.
+If you wish to explicitly save on all devices you can pass ``tnt_save_all_devices = True``
+to ``tnt.Model.save``, ``tnt.Model.save_weights`` and ``tnt.models.save_model``.
+
+
+.. _using-distributed-datasets-label:
+
+Using distributed datasets
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section explains what needs to be done in order to use Tarantella's distributed datasets correctly.
+
+The recommended way to provide your dataset to Tarantella is by passing a
+*batched* ``tf.data.Dataset`` to ``tnt.Model.fit``.
+In order to do this, create a ``Dataset`` and apply the ``batch``
+`transformation <https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch>`_
+to it, using the (global) batch size. However, do not provide a value for ``batch_size``
+in ``tnt.Model.fit``, as this would lead to double batching, and thus modified shapes
+of the input data.
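+
+A minimal sketch of this pattern (``x_train``, ``y_train``, and ``model`` are placeholders
+for your data and your ``tnt.Model``):
+
+.. code-block:: python
+
+   import tensorflow as tf
+
+   global_batch_size = 256
+   dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+   dataset = dataset.shuffle(buffer_size=len(x_train), seed=42)
+   dataset = dataset.batch(global_batch_size, drop_remainder=True)
+
+   # note: no `batch_size` argument here, as the dataset is already batched
+   model.fit(dataset, epochs=5)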
+
+Tarantella also supports batched and unbatched ``Dataset`` s in ``tnt.Model.fit``
+when setting the ``tnt_micro_batch_size`` argument. This can be useful to obtain
+maximal performance in multi-node execution, as explained in the advanced topics.
+Keep in mind, however, that Tarantella still expects
+the ``Dataset`` to be batched with the global batch size, and that the micro-batch
+size has to be consistent with the global batch size. [#footnote_consistent]_
+This is why it is recommended to use an unbatched ``Dataset`` when setting
+a ``tnt_micro_batch_size`` explicitly.
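+
+For example (``unbatched_dataset`` is a placeholder; with 8 devices, a micro-batch size
+of 32 corresponds to a global batch size of 8 * 32 = 256):
+
+.. code-block:: python
+
+   model.fit(unbatched_dataset, tnt_micro_batch_size=32)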
+
+Tarantella does not support any other way to feed data to ``fit`` at the moment.
+In particular, NumPy arrays, TensorFlow tensors and generators are not supported.
+
+Tarantella's automatic data distribution can be switched off by passing
+``tnt_distribute_dataset=False`` in ``tnt.Model.fit``, in which case Tarantella
+will issue an ``INFO`` message.
+If a validation dataset is passed to ``tnt.Model.fit``, it should also be batched
+with the global batch size. You can similarly switch off its automatic
+micro-batching mechanism by setting ``tnt_distribute_validation_dataset=False``.
+
+There are a few important points when using distributed datasets in Tarantella:
+
+.. note::
+
+ Batch size must be a multiple of the number of devices used.
+
+This restriction will be lifted in the next release.
+
+.. note::
+
+ The last incomplete batch is always dropped.
+
+We recommend using ``drop_remainder=True`` when batching a ``Dataset``.
+If ``drop_remainder`` is set to ``False``, Tarantella will ignore it
+and issue a ``WARNING`` message. This behavior will be fixed in the next release.
+
+.. note::
+
+ When using ``shuffle`` without a ``seed``, Tarantella will use a fixed default ``seed``.
+
+This guarantees that the input data is shuffled the same way on all devices,
+when no ``seed`` is given, which is necessary for consistency.
+However, when a random ``seed`` is provided by the user, Tarantella will use that one instead.
+
+.. _callbacks-label:
+
+Callbacks
+^^^^^^^^^
+
+At the moment, Tarantella fully supports 3 of the
+`Keras callbacks <https://keras.io/api/callbacks/>`__:
+
+* ``tf.keras.callbacks.LearningRateScheduler``
+* ``tf.keras.callbacks.ModelCheckpoint``
+* ``tf.keras.callbacks.TensorBoard``
+
+The ``LearningRateScheduler`` takes a ``schedule`` function, which will change the learning rate
+on each of the devices used (for a detailed explanation, cf. the
+`Keras documentation <https://keras.io/api/callbacks/learning_rate_scheduler/>`__).
+If ``verbose=1`` is set, Tarantella will only print on one device by default.
+This behavior can be changed by passing ``--output-on-all-devices`` to ``tarantella``.
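+
+A small sketch of its usage with a ``tnt.Model`` (the schedule itself is illustrative):
+
+.. code-block:: python
+
+   import tensorflow as tf
+
+   # halve the learning rate every 10 epochs
+   def schedule(epoch, lr):
+       if epoch > 0 and epoch % 10 == 0:
+           return lr * 0.5
+       return lr
+
+   lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule, verbose=1)
+   model.fit(train_dataset, epochs=30, callbacks=[lr_callback])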
+
+``ModelCheckpoint`` can be used to automatically checkpoint the state of the model
+during training. For an example, look :ref:`here <checkpointing-via-callbacks-label>`,
+and into the
+`Keras documentation <https://keras.io/api/callbacks/model_checkpoint/>`__.
+
+The ``TensorBoard`` callback can be used to collect training information for visualization
+in `TensorBoard <https://www.tensorflow.org/tensorboard>`__. By default, Tarantella
+will only collect (device local) information on one device. If you want to collect
+the local information on all devices use the environment variable ``TNT_TENSORBOARD_ON_ALL_DEVICES``:
+
+.. code-block:: bash
+
+ TNT_TENSORBOARD_ON_ALL_DEVICES=true tarantella -- model.py
+
+.. note::
+
+ At the moment, all of the other Keras callbacks will be executed on all devices with
+ local information only.
+
+For instance, the ``BaseLogger`` callback will be executed on each and every rank,
+and will log the accumulated metric averages for the local (micro-batch) information.
+
+Important points
+^^^^^^^^^^^^^^^^
+
+There are a number of points you should be aware of when using Tarantella.
+
+.. note::
+
+ ``tnt.init()`` needs to be called **after** ``import tarantella as tnt``, but **before**
+ any other statement.
+
+This will make sure the GPI-2 communication infrastructure is correctly initialized.
+
+.. note::
+
+ Tarantella does not support custom training loops.
+
+Instead of using custom training loops, please use ``Model.fit(...)``.
+
+.. note::
+
+ Tarantella supports all
+ `TensorFlow optimizers `_
+ with the exception of ``tf.keras.optimizers.Ftrl``.
+
+Since the ``Ftrl`` optimizer does not use batches, it is not supported in Tarantella.
+
+
+.. rubric:: Footnotes
+
+.. [#footnote_consistent] That is, the global batch size must equal the micro batch size times
+ the number of devices used.
diff --git a/docs/source/quick_start_model.py b/docs/source/quick_start_model.py
new file mode 100644
index 00000000..7a345bae
--- /dev/null
+++ b/docs/source/quick_start_model.py
@@ -0,0 +1,39 @@
+import tensorflow as tf
+from tensorflow import keras
+import tarantella as tnt
+
+# Skip function implementations for brevity
+[...]
+
+# Initialize Tarantella (before doing anything else)
+tnt.init()
+args = parse_args()
+
+# Create Tarantella model
+model = tnt.Model(lenet5_model_generator())
+
+# Compile Tarantella model (as with Keras)
+model.compile(optimizer = keras.optimizers.SGD(learning_rate=args.learning_rate),
+ loss = keras.losses.SparseCategoricalCrossentropy(),
+ metrics = [keras.metrics.SparseCategoricalAccuracy()])
+
+# Load MNIST dataset (as with Keras)
+shuffle_seed = 42
+(x_train, y_train), (x_val, y_val), (x_test, y_test) = \
+ mnist_as_np_arrays(args.train_size, args.val_size, args.test_size)
+
+train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_dataset = train_dataset.shuffle(len(x_train), shuffle_seed)
+train_dataset = train_dataset.batch(args.batch_size)
+train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
+
+test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
+test_dataset = test_dataset.batch(args.batch_size)
+
+# Train Tarantella model (as with Keras)
+model.fit(train_dataset,
+ epochs = args.number_epochs,
+ verbose = 1)
+
+# Evaluate Tarantella model (as with Keras)
+model.evaluate(test_dataset, verbose = 1)
diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst
new file mode 100644
index 00000000..c7c9b189
--- /dev/null
+++ b/docs/source/tutorials.rst
@@ -0,0 +1,304 @@
+Tutorials
+=========
+
+This section delves into more advanced usage of Tarantella with the help of
+state-of-the-art models for two widely-used applications in Deep Learning:
+
+* Image classification: ResNet-50
+* Machine translation: Transformer
+
+The models shown here are adapted from the
+`TensorFlow Model Garden `_.
+While the model implementations and hyperparameters are unchanged to preserve
+compatibility with the TensorFlow official models, we provide simplified training
+schemes that allow for a seamless transition from basic serial training to distributed
+data parallelism using Tarantella.
+
+
+Prerequisites
+-------------
+
+The tutorial models can be downloaded from the
+`Tnt Models repository <https://github.com/cc-hpc-itwm/tarantella_models>`_:
+
+.. code-block:: bash
+
+ export TNT_MODELS_PATH=/your/installation/path
+ cd ${TNT_MODELS_PATH}
+ git clone https://github.com/cc-hpc-itwm/tarantella_models
+
+To use these models, install the following dependencies:
+
+* TensorFlow 2.2.1
+* Tarantella 0.6.0
+
+For a step-by-step installation, follow the :ref:`installation-label` guide.
+In the following we will assume that TensorFlow was installed in a ``conda``
+environment called ``tarantella``.
+
+Now we can install the final dependency,
+`TensorFlow official Model Garden <https://github.com/tensorflow/models>`__:
+
+.. code-block:: bash
+
+ conda activate tarantella
+ pip install tf-models-official==2.2.1
+
+
+.. _resnet50-label:
+
+ResNet-50
+---------
+
+Deep Residual Networks (ResNets) represented a breakthrough in the field of
+computer vision, enabling deeper and more complex deep convolutional networks.
+Introduced in [He]_, ResNet50 has become a standard model for image classification
+tasks, and has been shown to scale to very large numbers of nodes in data-parallel
+training [Goyal]_.
+
+Run ResNet-50 with Tarantella
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Before running the model, we need to add it to the existing ``PYTHONPATH``.
+
+.. code-block:: bash
+
+ export PYTHONPATH=${TNT_MODELS_PATH}/models/resnet:${PYTHONPATH}
+
+Furthermore, the ``ImageNet`` dataset needs to be installed and available on
+all the nodes that we want to use for training.
+TensorFlow provides convenience scripts to download datasets in its ``datasets``
+package, which is installed as a dependency of the TensorFlow Model Garden.
+Install ImageNet on your local machine as described in the
+`TensorFlow datasets catalog <https://www.tensorflow.org/datasets/catalog/imagenet2012>`_.
+
+.. code-block:: bash
+
+ export TNT_DATASETS_PATH=/path/to/downloaded/datasets
+
+ python -m tensorflow_datasets.scripts.download_and_prepare \
+ --datasets=imagenet2012 --data_dir=${TNT_DATASETS_PATH}
+
+
+Let's assume we have access to two nodes (saved in ``hostfile``) equipped with 4 GPUs each.
+We can now simply run the ResNet-50 as follows:
+
+.. code-block:: bash
+
+ tarantella --hostfile ./hostfile --devices-per-node 4 \
+ -- ${TNT_MODELS_PATH}/models/resnet/resnet50_tnt.py --data_dir=${TNT_DATASETS_PATH} \
+ --batch_size=512 \
+ --train_epochs=90 \
+ --epochs_between_evals=10
+
+The above command will train a ResNet-50 model in parallel on the 8 available devices
+for ``90`` epochs, as suggested in [Goyal]_ to achieve convergence.
+The ``--epochs_between_evals`` parameter specifies the frequency of evaluations of the
+``validation`` data performed in between training epochs.
+
+Note the ``--batch_size`` parameter, which specifies the global batch size used in training.
+
+Implementation overview
+^^^^^^^^^^^^^^^^^^^^^^^
+We will now take a closer look at the implementation of the ResNet-50 training scheme.
+The main training steps reside in the ``models/resnet/resnet50_tnt.py`` file.
+
+The most important step in enabling data parallelism with Tarantella is
+to wrap the Keras model:
+
+.. code-block:: python
+
+ model = resnet_model.resnet50(num_classes=tf_imagenet_preprocessing.NUM_CLASSES)
+ model = tnt.Model(model)
+
+The following operations are identical to serial training, as they do not
+require any changes.
+In particular, the ImageNet dataset is loaded and preprocessed as follows:
+
+.. code-block:: python
+
+ train_dataset = imagenet_preprocessing.input_fn(is_training=True,
+ data_dir=flags_obj.data_dir,
+ batch_size=flags_obj.batch_size,
+ shuffle_seed = 42,
+ drop_remainder=True)
+
+The
+`imagenet_preprocessing.input_fn
+`_
+function takes the input files in ``data_dir``, loads the training samples and processes
+them into TensorFlow datasets.
+
+The user only needs to pass the global ``batch_size`` value, and the Tarantella
+framework will ensure that the dataset is properly distributed among devices,
+such that:
+
+  * each device will process an independent set of samples
+  * each device will group the samples into micro-batches, where the micro-batch
+    size will be computed as ``batch_size / num_devices`` (see the sketch below)
+  * each device will apply the same set of transformations to its input samples, as
+    specified in the ``input_fn`` function.
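+
+For the command line above, this amounts to the following micro-batch size
+(a sketch of the arithmetic, not code from the model implementation):
+
+.. code-block:: python
+
+   batch_size = 512         # global batch size passed via --batch_size
+   num_devices = 2 * 4      # 2 nodes with 4 GPUs each
+   micro_batch_size = batch_size // num_devices  # 64 samples per device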
+
+Before starting the training, the model is compiled to use a standard Keras optimizer
+and loss.
+
+.. code-block:: python
+
+ model.compile(optimizer=optimizer,
+ loss='sparse_categorical_crossentropy',
+ metrics=(['sparse_categorical_accuracy']))
+
+We provide flags to enable the most commonly used Keras ``callbacks``, such as
+the ``TensorBoard`` profiler, which can simply be passed to the ``fit`` function
+of the Tarantella model.
+
+.. code-block:: python
+
+ callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir,
+ profile_batch=2))
+
+If model checkpointing is required, it can be enabled through the ``ModelCheckpoint``
+callback as usual (cf. :ref:`checkpointing models with Tarantella `).
+
+.. code-block:: python
+
+ callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path, save_weights_only=True))
+
+
+There is no need for any further changes to proceed with training:
+
+.. code-block:: python
+
+ history = model.fit(train_dataset,
+ epochs=flags_obj.train_epochs,
+ callbacks=callbacks,
+ validation_data=validation_dataset,
+ validation_freq=flags_obj.epochs_between_evals,
+ verbose=1)
+
+.. todo::
+
+ Advanced topics:
+
+ * scaling batch size with number of ranks (-> only mention here & link to advanced topics)
+ * introduce learning rate warm up
+ * introduce learning rate scaling (with #ranks)
+
+
+.. _transformer-label:
+
+Transformers
+------------
+
+The Transformer is a Deep Neural Network widely used in the field of natural language processing (NLP),
+in particular for tasks such as machine translation.
+It was first proposed by [Vaswani]_.
+
+Run the Transformer with Tarantella
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The Transformer training scheme can be found in the ``models/transformer`` directory
+of the `Tnt Models repository <https://github.com/cc-hpc-itwm/tarantella_models>`__,
+and has to be added to
+the existing ``PYTHONPATH``:
+
+.. code-block:: bash
+
+ export PYTHONPATH=${TNT_MODELS_PATH}/models/transformer:${PYTHONPATH}
+
+We will follow the training procedure presented in [Vaswani]_, where the authors
+show results for training the `big` variant of the Transformer model on
+a machine translation dataset called
+`WMT14 <https://www.statmt.org/wmt14/>`_.
+
+To install the dataset, we will use the TensorFlow ``datasets`` package, which
+should have already been installed in your ``conda`` environment as a
+dependency of the TensorFlow Model Garden, and download the English-German
+dataset to match the results by [Vaswani]_.
+Detailed instructions on how to obtain the dataset are provided in the
+`TensorFlow documentation <https://www.tensorflow.org/datasets>`_.
+
+Now we can start training.
+Once again, let's assume we have access to two nodes (specified in ``hostfile``)
+equipped with 4 GPUs each.
+
+.. code-block:: bash
+
+   export WMT14_PATH=/path/to/the/installed/dataset
+
+   tarantella --hostfile ./hostfile --devices-per-node 4 \
+     -- ${TNT_MODELS_PATH}/models/transformer/transformer_tnt.py \
+     --data_dir=${WMT14_PATH} \
+     --vocab_file=${WMT14_PATH}/vocab.ende.32768 \
+     --bleu_ref=${WMT14_PATH}/newstest2014.de \
+     --bleu_source=${WMT14_PATH}/newstest2014.en \
+     --param_set=big \
+     --train_epochs=30 \
+     --batch_size=32736
+
+The above command will select the ``big`` model implementation and train it
+distributedly on the 8 specified devices.
+To reach the target accuracy, [Vaswani]_ specifies that the model needs to be
+trained for ``30`` epochs.
+
+The Transformer requires access to a vocabulary file, which contains all the
+tokens derived from the dataset. This is provided as the ``vocab_file`` parameter
+and is part of the pre-processed dataset.
+
+After training, one round of evaluation is conducted using the ``newstest2014``
+dataset to translate English sentences into German.
+
+Implementation overview
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The Transformer model itself is implemented and imported from the
+`TensorFlow Model Garden
+`__.
+The training procedure and dataset loading and pre-processing do not require
+extensive changes to work with Tarantella. However, we provide a simplified
+version to highlight the usage of Tarantella with Keras training loops.
+
+Thus, the Keras transformer model is created in
+``models/transformer/transformer_tnt.py`` and wrapped into a Tarantella model:
+
+.. code-block:: python
+
+   model = transformer.create_model(self.params, is_train=True)
+   model = tnt.Model(model)
+
+Data is loaded as follows, without any specific modification to trigger
+distributed training:
+
+.. code-block:: python
+
+ train_ds = data_pipeline.train_input_fn(self.params)
+
+Here, the ``data_pipeline.train_input_fn`` reads in the dataset and applies a series
+of transformations to convert it into a batched set of sentences.
+The advantage of using the *automatic dataset distribution* mechanism of Tarantella
+is that users can reason about their I/O pipeline without having to deal with the details
+of how to distribute it.
+Note, however, that the batch size has to be a multiple of the number of ranks, so
+that it can be evenly divided into micro-batches.
+
+Next, the user can also create callbacks, which can then be simply passed on to
+the training function.
+
+.. code-block:: python
+
+ callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))
+
+Finally, we can call ``model.fit`` to start distributed training on all devices:
+
+.. code-block:: python
+
+ history = model.fit(train_ds,
+ epochs=self.params["train_epochs"],
+ callbacks=callbacks,
+ verbose=1)
+
+.. todo::
+
+ Important points
+
+ * Mixing Keras and Tarantella models
+
diff --git a/docs/source/why_tarantella.rst b/docs/source/why_tarantella.rst
new file mode 100644
index 00000000..4313dd35
--- /dev/null
+++ b/docs/source/why_tarantella.rst
@@ -0,0 +1,44 @@
+Why Tarantella?
+===============
+
+Tarantella is an open-source Deep Learning framework that focuses on providing fast, scalable and
+efficient training of Deep Neural Networks (DNNs) on High Performance Computing (HPC) clusters.
+
+Goals
+-----
+
+Tarantella is designed to meet the following goals:
+
+.. code-block:: text
+
+ Tarantella...
+
+ 1. ...provides strong scalability
+ 2. ...is easy to use
+ 3. ...follows a synchronous training scheme
+ 4. ...integrates well with existing models
+ 5. ...provides support for GPU and CPU systems
+
+Tarantella provides close to linear speed-up for the training of common Deep Learning architectures,
+thus considerably reducing the required time-to-accuracy in many Deep Learning workflows.
+To make this capability accessible to as many users as possible, Tarantella's interface
+is designed such that its use does not require any expertise in HPC or parallel computing.
+
+To allow integrating Tarantella into any TensorFlow-based Deep Learning workflow,
+we put special emphasis on strictly following the synchronous optimization scheme
+used to train DNNs. This guarantees that results obtained in serial execution can be
+reproduced when using distributed training
+(cf., however, :ref:`these guidelines <points-to-consider-label>`),
+so that computation can be scaled up at any point in time without losing reproducibility
+of the results.
+
+Furthermore, we made sure that existing TensorFlow 2/Keras
+models can be made ready for distributed training with minimal effort
+(follow the :ref:`Quick Start guide <quick-start-label>` to learn more).
+Tarantella supports distributed training on GPU and pure CPU clusters,
+independently of the hardware vendors.
+
+.. todo::
+
+ Performance Results
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000..4870d072
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(TNT_PYTHON_DIRS
+ ${SRC_DIR}/tarantella
+ ${SRC_DIR}/runtime
+ ${SRC_DIR}/gpi_comm_lib/tf_ops/tnt_tfops)
+
+install(DIRECTORY ${TNT_PYTHON_DIRS}
+ DESTINATION ${INSTALL_LIB_DIR}/python
+ FILES_MATCHING PATTERN "*.py")
+
+install(PROGRAMS ${SRC_DIR}/bin/tarantella
+ DESTINATION ${INSTALL_BIN_DIR})
+
+set(VERSION_FILE_TEMPLATE ${CMAKE_SOURCE_DIR}/cmake/version.py.in)
+set(VERSION_FILE ${CMAKE_BUILD_DIR}/version.py)
+configure_file(${VERSION_FILE_TEMPLATE} ${VERSION_FILE} @ONLY)
+install(FILES ${VERSION_FILE}
+ DESTINATION ${INSTALL_LIB_DIR}/python)
diff --git a/src/bin/tarantella b/src/bin/tarantella
new file mode 100755
index 00000000..950e3531
--- /dev/null
+++ b/src/bin/tarantella
@@ -0,0 +1,198 @@
+#!/usr/bin/env python
+import argparse
+import logging
+import os
+import shutil
+import subprocess
+import sys
+
+TNT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+LIB_DIR = os.path.join(TNT_DIR, "lib/tarantella")
+PYLIB_DIR = os.path.join(TNT_DIR, "lib/tarantella/python")
+sys.path.insert(0, LIB_DIR)
+sys.path.insert(0, PYLIB_DIR)
+
+try:
+ from version import tnt_version
+except ImportError:
+ tnt_version = "Unknown version"
+
+try:
+ import runtime
+except ModuleNotFoundError as e:
+ raise RuntimeError("[TNT_CLI] Cannot find Tarantella `runtime` module; \
+make sure the `tarantella` script is started from an installed version.") from e
+
+import runtime.file_management as file_man
+import runtime.logging_config as logging_config
+import runtime.platform_config as platform_config
+import runtime.environment_config as env_config
+from runtime import logger
+
+def parse_args():
+ parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+ singlenode_group = parser.add_argument_group('Single-node execution')
+ singlenode_group.add_argument("-n",
+ help="number of TensorFlow instances to start on the local node",
+ dest = "npernode",
+ metavar = "N",
+ type = int,
+ default = None)
+ multinode_group = parser.add_argument_group('Multi-node execution')
+ multinode_group.add_argument("--hostfile",
+ dest = "hostfile",
+ help="path to the list of nodes (hostnames) on which to execute the SCRIPT",
+ default = None)
+ multinode_group.add_argument("--n-per-node", "--devices-per-node",
+ help="number of devices (i.e., either GPUs or processes on CPUs) to be used on each node",
+ dest = "npernode",
+ type = int,
+ default = None)
+
+ parser.add_argument("--no-gpu", "--no-gpus",
+ help="disallow GPU usage",
+ dest = "use_gpus",
+ action='store_false',
+ default = True)
+ parser.add_argument("--output-on-all-devices",
+ help="enable output on all devices (e.g., training info)",
+ dest = "output_all",
+ action='store_true',
+ default = False)
+ parser.add_argument("--log-on-all-devices",
+ help="enable library logging messages on all devices",
+ dest = "log_all",
+ action='store_true',
+ default = False)
+ log_levels = ('DEBUG', 'INFO', 'WARNING', 'ERROR')
+ parser.add_argument('--log-level', default='WARNING', choices=log_levels,
+ help = "logging level for library messages")
+ parser.add_argument("--fusion-threshold",
+ help="tensor fusion threshold [kilobytes]",
+ dest = "fusion_threshold_kb",
+ type = int,
+ default = None)
+ parser.add_argument("--dry-run",
+ help="print generated files and execution command",
+ dest = "dry_run",
+ action='store_true',
+ default = False)
+ parser.add_argument("--version",
+ action='version',
+ version=generate_version_message())
+ parser.add_argument('script', nargs='+',metavar='-- SCRIPT')
+ args = parser.parse_args()
+ return args
+
+def tnt_run_message(command_list, hostfile_path, exec_script_path):
+ msg = ""
+ if not hostfile_path is None:
+ msg += "\n{}\nGenerated hostfile:\n".format("="*80)
+ with open(hostfile_path, 'r') as f:
+ msg += "============= {} =============\n{}\n".format(hostfile_path,
+ "".join(f.readlines()))
+ if not exec_script_path is None:
+ msg += "\n{}\nGenerated script:\n".format("="*80)
+ with open(exec_script_path, 'r') as f:
+ msg += "============= {} =============\n{}\n".format(exec_script_path,
+ "".join(f.readlines()))
+ msg += "\n{}".format("="*80)
+ msg += "\nCommand:\n\t{}\n".format(" ".join(command_list))
+ return msg
+
+def generate_dry_run_message(command_list, hostfile_path, exec_script_path):
+ msg = "\n{}".format("="*80)
+ msg += "\n{0}{1}DRY RUN {1}{0}\n".format("="*6, " "*30)
+ msg += tnt_run_message(command_list, hostfile_path, exec_script_path)
+ return msg
+
+def generate_run_error_message(e, hostfile_path = None,
+ executed_script_path = None):
+ error_string = ""
+ if not e.stdout is None:
+ error_string += "============= STDOUT =============\n{}\n".format(e.stdout)
+ if not e.stderr is None:
+ error_string += "============= STDERR =============\n{}\n".format(e.stderr)
+ error_string += tnt_run_message(e.cmd, hostfile_path = hostfile_path,
+ exec_script_path = executed_script_path)
+ error_string += "[TNT_CLI] Execution failed with status {}".format(e.returncode)
+ return error_string
+
+def generate_version_message():
+ msg = ["Tarantella {}".format(tnt_version),
+ "Path: {}".format(os.path.dirname(os.path.abspath(__file__))),
+ "Copyright (C) 2020 Fraunhofer"]
+ return "\n".join(msg)
+
+class Tarantella:
+ def __init__(self, hostlist, num_gpus_per_node, num_cpus_per_node, args):
+ self.args = args
+
+ self.hostlist = hostlist
+ self.command_list = args.script
+ self.num_gpus_per_node = num_gpus_per_node
+
+ # compute number of ranks per node to create the hostfile
+ npernode = num_gpus_per_node
+ device_type = "GPUs"
+ if npernode == 0:
+ npernode = num_cpus_per_node
+ device_type = "CPU processes"
+
+ self.nranks = len(hostlist) * npernode
+ self.hostfile = file_man.HostFile(self.hostlist, npernode)
+ self.executable_script = self.generate_executable_script()
+
+ logger.info("Starting Tarantella on {} devices ({} nodes x {} {})".format(self.nranks,
+ len(self.hostlist), npernode, device_type))
+
+
+ def generate_executable_script(self):
+ # create execution script
+ header = "#!/bin/bash\n"
+ header += "cd {}".format(os.path.abspath(os.getcwd()))
+
+ environment = env_config.gen_exports_from_dict(env_config.collect_environment_variables()) + \
+ env_config.gen_exports_from_dict(env_config.collect_tensorflow_variables()) + \
+ env_config.gen_exports_from_dict(env_config.collect_tarantella_variables()) + \
+ env_config.gen_exports_from_dict(env_config.get_tnt_variables_from_args(self.args)) +\
+ env_config.gen_exports_from_dict(env_config.get_tnt_gpus(self.num_gpus_per_node))
+
+ command = "python {}".format(' '.join(self.command_list))
+ return file_man.GPIScriptFile(header, environment, command, dir = os.getcwd())
+
+ def run(self, dry_run = False):
+ with self.hostfile, self.executable_script:
+ command_list = ["gaspi_run", "-n", str(self.nranks),
+ "-m", self.hostfile.name,
+ self.executable_script.filename]
+
+ if dry_run:
+ print(generate_dry_run_message(command_list, self.hostfile.name,
+ self.executable_script.filename))
+ return
+
+ path_to_gpi = shutil.which("gaspi_run")
+ if path_to_gpi is None:
+ sys.exit("[TNT_CLI] Cannot execute `gaspi_run`; make sure it is added to the current `PATH`.")
+
+ try:
+ result = subprocess.run(command_list,
+ check = True,
+ cwd = os.getcwd(),
+ stdout = None, stderr = None)
+ except subprocess.CalledProcessError as e:
+ sys.exit(generate_run_error_message(e, self.hostfile.name,
+ self.executable_script.filename))
+
+if __name__ == "__main__":
+ args = parse_args()
+ logging_config.setup_logging(logger, args.log_level)
+
+ nodes_list = platform_config.generate_nodes_list(args.hostfile)
+ num_gpus, num_cpus = platform_config.generate_num_devices_per_node(npernode = args.npernode,
+ use_gpus = args.use_gpus)
+ env_config.update_environment_paths(LIB_DIR)
+
+ tarantella = Tarantella(nodes_list, num_gpus, num_cpus, args)
+ tarantella.run(args.dry_run)
\ No newline at end of file
diff --git a/src/gpi_comm_lib/AtomicCondition.hpp b/src/gpi_comm_lib/AtomicCondition.hpp
new file mode 100644
index 00000000..99fc54d3
--- /dev/null
+++ b/src/gpi_comm_lib/AtomicCondition.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <condition_variable>
+#include <mutex>
+
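+// Minimal one-shot signal between two threads: wait() blocks until notify()
+// has been called, then resets the flag so the object can be reused.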
+class AtomicCondition
+{
+ public:
+ void notify()
+ {
+ {
+ std::lock_guard<std::mutex> lk(lock);
+ done = true;
+ }
+ condition.notify_one();
+ }
+
+ void wait()
+ {
+ std::unique_lock<std::mutex> lk(lock);
+ condition.wait(lk, [&done = done]{return done;});
+ done = false;
+ }
+
+ private:
+ std::mutex lock;
+ std::condition_variable condition;
+ bool done = false;
+};
\ No newline at end of file
diff --git a/src/gpi_comm_lib/CMakeLists.txt b/src/gpi_comm_lib/CMakeLists.txt
new file mode 100644
index 00000000..f31227f3
--- /dev/null
+++ b/src/gpi_comm_lib/CMakeLists.txt
@@ -0,0 +1,34 @@
+include (add_macros)
+
+set(GPI_LIB_MODULE "GPICommLib")
+
+set (GPICOMMLIB_SOURCES
+ ${SRC_DIR}/gpi_comm_lib/distribution/SegmentIDBuilder.cpp
+ ${SRC_DIR}/gpi_comm_lib/distribution/utilities.cpp
+ ${SRC_DIR}/gpi_comm_lib/PipelineCommunicator.cpp
+ ${SRC_DIR}/gpi_comm_lib/SynchCommunicator.cpp
+ ${SRC_DIR}/gpi_comm_lib/TensorBroadcaster.cpp
+)
+
+extended_add_library(NAME gpicommlib
+ NAMESPACE tnt
+ TYPE SHARED
+ SOURCES
+ ${GPICOMMLIB_SOURCES}
+ LIBRARIES
+ tnt::gpiresources
+ tnt::collectives
+ INCLUDE_DIRECTORIES
+ ${SRC_DIR}/gpi_comm_lib/
+ INSTALL
+ INSTALL_DESTINATION
+ ${INSTALL_LIB_DIR}
+ POSITION_INDEPENDENT)
+
+pybind11_add_module(${GPI_LIB_MODULE} MODULE
+ ${SRC_DIR}/gpi_comm_lib/pybind11_wrappers.cpp)
+target_link_libraries(${GPI_LIB_MODULE} PRIVATE pybind11::module
+ tnt::gpicommlib)
+install(TARGETS ${GPI_LIB_MODULE}
+ LIBRARY
+ DESTINATION ${INSTALL_LIB_DIR})
diff --git a/src/gpi_comm_lib/PipelineCommunicator.cpp b/src/gpi_comm_lib/PipelineCommunicator.cpp
new file mode 100644
index 00000000..cf4c328d
--- /dev/null
+++ b/src/gpi_comm_lib/PipelineCommunicator.cpp
@@ -0,0 +1,116 @@
+#include "PipelineCommunicator.hpp"
+
+#include "collectives/barrier/GPIBarrier.hpp"
+#include "distribution/GroupBuilder.hpp"
+#include "gpi/gaspiCheckReturn.hpp"
+
+#include <GASPI.h>
+
+#include <cstring>
+
+namespace tarantella
+{
+ PipelineCommunicator::PipelineCommunicator(
+ GPI::Context& context,
+ std::unordered_map<ConnectionID, ConnectionInfo> const& connection_infos,
+ std::size_t num_micro_batches)
+ : resource_manager(context.get_resource_manager())
+ {
+ for(auto const& [conn_id, conn_info] : connection_infos)
+ {
+ auto const segment_id = conn_info.segment_id;
+ auto const buffer_size = conn_info.microbatched_buffer_size_bytes;
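+ // each connection holds one send and one receive buffer per micro-batch,
+ // hence the factor 2 in the segment size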
+ auto const segment_size = 2 * num_micro_batches * buffer_size;
+
+ auto const segment_group = resource_manager.make_group({context.get_rank(), conn_info.other_rank});
+ resource_manager.make_segment_resources(segment_id, segment_group, segment_size);
+
+ std::vector<GPI::SegmentBuffer> send_bufs;
+ std::vector<GPI::SegmentBuffer> recv_bufs;
+ std::vector<gaspi_notification_id_t> notifications;
+ for(std::size_t m_id = 0; m_id < num_micro_batches; ++m_id)
+ {
+ send_bufs.push_back(resource_manager.get_buffer_of_size(segment_id, buffer_size));
+ recv_bufs.push_back(resource_manager.get_buffer_of_size(segment_id, buffer_size));
+ notifications.push_back(resource_manager.get_notification_range(segment_id, 1).first);
+ }
+ connections.emplace(conn_id, SendRecvResources(conn_info.other_rank,
+ send_bufs,
+ recv_bufs,
+ notifications));
+ }
+
+ // Barrier is required, to ensure all ranks have finished registering
+ // their segments to their communication partners
+ collectives::Barrier::GPIBarrierAllRanks barrier;
+ barrier.blocking_barrier();
+ }
+
+ void PipelineCommunicator::non_blocking_send(void* local_send_buf,
+ ConnectionID conn_id,
+ MicrobatchID micro_id)
+ {
+ auto const& local_segment_buf = connections[conn_id].send_bufs[micro_id];
+ auto const& remote_segment_buf = connections[conn_id].recv_bufs[micro_id];
+
+ copy_data_to_segment(local_send_buf, local_segment_buf);
+
+ GPI::gaspiCheckReturn(
+ gaspi_write_notify(local_segment_buf.get_segment_id(),
+ local_segment_buf.get_offset(),
+ connections[conn_id].other_rank,
+ remote_segment_buf.get_segment_id(),
+ remote_segment_buf.get_offset(),
+ local_segment_buf.get_size(),
+ connections[conn_id].notifications[micro_id],
+ micro_id + 1, // to check micro_id at recv (must not be zero)
+ resource_manager.get_queue_id_for_write_notify(),
+ GASPI_BLOCK),
+ "PipelineCommunicator::non_blocking_send");
+ }
+
+ void PipelineCommunicator::blocking_recv(void* local_recv_buf,
+ ConnectionID conn_id,
+ MicrobatchID micro_id)
+ {
+ auto const& local_segment_buf = connections[conn_id].recv_bufs[micro_id];
+ gaspi_notification_id_t received_notification_id = 0;
+ gaspi_notification_t received_notification_value = 0;
+
+ GPI::gaspiCheckReturn(
+ gaspi_notify_waitsome(local_segment_buf.get_segment_id(),
+ connections[conn_id].notifications[micro_id],
+ 1,
+ &received_notification_id,
+ GASPI_BLOCK),
+ "PipelineCommunicator::blocking_recv : gaspi_notify_waitsome");
+ GPI::gaspiCheckReturn(
+ gaspi_notify_reset(local_segment_buf.get_segment_id(),
+ received_notification_id,
+ &received_notification_value),
+ "PipelineCommunicator::blocking_recv : gaspi_notify_reset");
+ if (received_notification_value != micro_id + 1)
+ {
+ throw std::runtime_error("PipelineCommunicator::blocking_recv : \
+ Incorrect notification value received");
+ }
+
+ copy_data_from_segment(local_recv_buf, local_segment_buf);
+ }
+
+ void PipelineCommunicator::copy_data_to_segment(void* local_send_buf,
+ GPI::SegmentBuffer const& segment_buffer)
+ {
+ auto const segment_ptr = segment_buffer.get_ptr();
+ auto const buffer_size = segment_buffer.get_size();
+ std::memcpy(segment_ptr, local_send_buf, buffer_size);
+ }
+
+ void PipelineCommunicator::copy_data_from_segment(void* local_recv_buf,
+ GPI::SegmentBuffer const& segment_buffer)
+ {
+ auto const segment_ptr = segment_buffer.get_ptr();
+ auto const buffer_size = segment_buffer.get_size();
+ std::memcpy(local_recv_buf, segment_ptr, buffer_size);
+ }
+}
diff --git a/src/gpi_comm_lib/PipelineCommunicator.hpp b/src/gpi_comm_lib/PipelineCommunicator.hpp
new file mode 100644
index 00000000..3c6effaa
--- /dev/null
+++ b/src/gpi_comm_lib/PipelineCommunicator.hpp
@@ -0,0 +1,65 @@
+#pragma once
+
+#include <gpi/Context.hpp>
+#include <gpi/ResourceManager.hpp>
+#include <gpi/SegmentBuffer.hpp>
+
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace tarantella
+{
+ class SendRecvResources
+ {
+ public:
+ SendRecvResources() = default;
+ SendRecvResources(GPI::Rank rank,
+ std::vector<GPI::SegmentBuffer> const& send_bufs,
+ std::vector<GPI::SegmentBuffer> const& recv_bufs,
+ std::vector<gaspi_notification_id_t> const& notifications)
+ : other_rank(rank), send_bufs(send_bufs), recv_bufs(recv_bufs), notifications(notifications)
+ {}
+
+ GPI::Rank other_rank;
+ std::vector<GPI::SegmentBuffer> send_bufs;
+ std::vector<GPI::SegmentBuffer> recv_bufs;
+ std::vector<gaspi_notification_id_t> notifications;
+ };
+
+ class ConnectionInfo
+ {
+ public:
+ explicit ConnectionInfo(GPI::SegmentID segment_id, GPI::Rank other_rank, std::size_t buffer_size_bytes)
+ : segment_id(segment_id), other_rank(other_rank), microbatched_buffer_size_bytes(buffer_size_bytes)
+ {}
+
+ GPI::SegmentID segment_id;
+ GPI::Rank other_rank;
+ std::size_t microbatched_buffer_size_bytes;
+ };
+
+ class PipelineCommunicator
+ {
+ public:
+ using ConnectionID = std::size_t;
+ using MicrobatchID = std::size_t;
+
+ PipelineCommunicator(GPI::Context&,
+ std::unordered_map<ConnectionID, ConnectionInfo> const&,
+ std::size_t num_micro_batches);
+
+ void non_blocking_send(void* local_send_buf,
+ ConnectionID,
+ MicrobatchID);
+ void blocking_recv(void* local_recv_buf,
+ ConnectionID,
+ MicrobatchID);
+
+ private:
+ GPI::ResourceManager& resource_manager;
+ std::unordered_map<ConnectionID, SendRecvResources> connections;
+
+ void copy_data_to_segment(void* local_send_buf, GPI::SegmentBuffer const&);
+ void copy_data_from_segment(void* local_recv_buf, GPI::SegmentBuffer const&);
+ };
+}
diff --git a/src/gpi_comm_lib/SynchCommunicator.cpp b/src/gpi_comm_lib/SynchCommunicator.cpp
new file mode 100644
index 00000000..163635b5
--- /dev/null
+++ b/src/gpi_comm_lib/SynchCommunicator.cpp
@@ -0,0 +1,166 @@
+#include "SynchCommunicator.hpp"
+#include "collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp"
+
+#include <cstring>
+#include <memory>
+#include <stdexcept>
+
+namespace tarantella
+{
+ void SynchCommunicator::create_fused_tensor_infos_and_ids(
+ std::vector<collectives::TensorInfo> const& tensor_infos,
+ std::size_t threshold_bytes)
+ {
+ collectives::TensorFusor fusor {threshold_bytes};
+ fusor.fuse_tensor_infos_and_ids(tensor_infos, fused_ids, fused_tensor_infos);
+ }
+
+ void SynchCommunicator::create_fused_tensors_synchronization()
+ {
+ for(auto const& fused_info : fused_tensor_infos)
+ {
+ auto const fused_id = fused_info.first;
+ ready_to_start_counters[fused_id] = std::make_unique<std::atomic<std::size_t>>(0UL);
+ finished_counters[fused_id] = std::make_unique<std::atomic<std::size_t>>(0UL);
+ ready_to_copy_back[fused_id] = std::make_unique<std::atomic<bool>>(false);
+ ready_to_reset_counters[fused_id] = std::make_unique<std::atomic<std::size_t>>(0UL);
+ }
+ }
+
+ SynchCommunicator::SynchCommunicator(GPI::Context& context,
+ GPI::SegmentID segment_id,
+ GPI::Group const& group,
+ std::vector<collectives::TensorInfo> const& tensor_infos,
+ std::size_t threshold_for_tensor_fusion_bytes)
+ : resource_manager(context.get_resource_manager()),
+ segment_id(segment_id),
+ group(group),
+ queue_handler(),
+ fused_ids(),
+ fused_tensor_infos(),
+ operators(),
+ ready_to_start_counters(),
+ finished_counters(),
+ ready_to_copy_back(),
+ ready_to_reset_counters(),
+ setup_has_finished(),
+ terminate_man_thread(false),
+ management_thread(&tarantella::SynchCommunicator::management_thread_task, this)
+ {
+ using AllreduceImplementation = collectives::Allreduce::RecursiveHalvingDoubleBuffer;
+ create_fused_tensor_infos_and_ids(tensor_infos, threshold_for_tensor_fusion_bytes);
+ create_fused_tensors_synchronization();
+ create_segment_resources<AllreduceImplementation>(tensor_infos);
+ create_operators_with_state<AllreduceImplementation>();
+ setup_has_finished.notify();
+ }
+
+ SynchCommunicator::SynchCommunicator(GPI::Context& context,
+ GPI::SegmentID segment_id,
+ GPI::Group const& group,
+ std::vector<collectives::TensorInfo> const& tensor_infos)
+ : SynchCommunicator(context, segment_id, group, tensor_infos, 0UL)
+ { }
+
+ SynchCommunicator::~SynchCommunicator()
+ {
+ terminate_man_thread = true;
+ if (management_thread.joinable())
+ {
+ management_thread.join();
+ }
+ }
+
+ void SynchCommunicator::start_allreduce_impl(GradID const& grad_id, const void* data_ptr)
+ {
+ auto const fused_id = fused_ids[grad_id];
+
+ // All `grad_id`s copy-in their respective data
+ copy_data_to_segment(grad_id, data_ptr);
+ auto const value = ready_to_start_counters[fused_id]->fetch_add(1UL);
+
+ // Make sure all copies are done, before last `grad_id` starts operator
+ if (value == fused_tensor_infos[fused_id].get_num_tensors()-1)
+ {
+ operators[fused_id].allreduce->start();
+ ready_to_start_counters[fused_id]->store(0UL);
+ }
+ }
+
+ void SynchCommunicator::finish_allreduce_impl(GradID const& grad_id, void* results_ptr)
+ {
+ auto const fused_id = fused_ids[grad_id];
+
+ // First `grad_id` to arrive waits for `has_finished`, and notifies
+ // everyone that results can be copied back
+ auto const num_arrived = finished_counters[fused_id]->fetch_add(1UL);
+ if (num_arrived == 0)
+ {
+ operators[fused_id].has_finished->wait();
+ ready_to_copy_back[fused_id]->store(true);
+ }
+
+ // All `grad_id`s copy-out their respective data,
+ // once results have been obtained
+ while(true)
+ {
+ if(ready_to_copy_back[fused_id]->load())
+ {
+ copy_data_from_segment(grad_id, results_ptr);
+ break;
+ }
+ }
+
+ // Make sure all copies are done, before last `grad_id` resets initial state
+ auto const copied_grads = ready_to_reset_counters[fused_id]->fetch_add(1UL);
+ if (copied_grads == fused_tensor_infos[fused_id].get_num_tensors()-1)
+ {
+ operators[fused_id].allreduce->reset_for_reuse();
+ finished_counters[fused_id]->store(0UL);
+ ready_to_copy_back[fused_id]->store(false);
+ ready_to_reset_counters[fused_id]->store(0UL);
+ }
+ }
+
+ void SynchCommunicator::copy_data_to_segment(GradID const& grad_id, const void* data_ptr)
+ {
+ auto const fused_id = fused_ids[grad_id];
+ auto const segment_ptr = reinterpret_cast<char*>(operators[fused_id].allreduce->get_input_ptr())
+ + fused_tensor_infos[fused_id].get_local_offset_bytes(grad_id);
+ std::memcpy(segment_ptr, data_ptr, fused_tensor_infos[fused_id].get_local_size_bytes(grad_id));
+ }
+
+ void SynchCommunicator::copy_data_from_segment(GradID const& grad_id, void* results_ptr)
+ {
+ auto const fused_id = fused_ids[grad_id];
+ auto const segment_ptr = reinterpret_cast<char*>(operators[fused_id].allreduce->get_result_ptr())
+ + fused_tensor_infos[fused_id].get_local_offset_bytes(grad_id);
+ std::memcpy(results_ptr, segment_ptr, fused_tensor_infos[fused_id].get_local_size_bytes(grad_id));
+ }
+
+ void SynchCommunicator::management_thread_task()
+ {
+ setup_has_finished.wait();
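+ // busy-poll all operators: each pass triggers one communication step per
+ // unfinished allreduce and signals `has_finished` once its results are ready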
+ while (!terminate_man_thread)
+ {
+ while (true)
+ {
+ if (terminate_man_thread)
+ {
+ break;
+ }
+ for (auto& element : operators)
+ {
+ auto& op = *(element.second.allreduce.get());
+ if (op.is_finished()) continue;
+
+ op.trigger_communication_step();
+ if (op.is_finished())
+ {
+ element.second.has_finished->notify();
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/SynchCommunicator.hpp b/src/gpi_comm_lib/SynchCommunicator.hpp
new file mode 100644
index 00000000..db0a2fc7
--- /dev/null
+++ b/src/gpi_comm_lib/SynchCommunicator.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include "AtomicCondition.hpp"
+#include "collectives/allreduce/Operator.hpp"
+#include "collectives/barrier/GPIBarrier.hpp"
+#include "collectives/FusedTensorInfo.hpp"
+#include "collectives/TensorInfo.hpp"
+#include "collectives/Types.hpp"
+#include "distribution/utilities.hpp"
+#include "gpi/Context.hpp"
+#include "gpi/ResourceManager.hpp"
+#include "queues.h"
+
+#include <atomic>
+#include <cstddef>
+#include <functional>
+#include <memory>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+namespace tarantella
+{
+ using GradID = collectives::GradID;
+ using FusedID = collectives::FusedID;
+
+ class SynchCommunicator
+ {
+ public:
+ SynchCommunicator(GPI::Context&, GPI::SegmentID, GPI::Group const&, std::vector<collectives::TensorInfo> const&);
+ SynchCommunicator(GPI::Context&, GPI::SegmentID, GPI::Group const&, std::vector<collectives::TensorInfo> const&, std::size_t);
+ SynchCommunicator(SynchCommunicator&) = delete;
+ SynchCommunicator& operator=(SynchCommunicator&) = delete;
+ ~SynchCommunicator();
+
+ // TODO: Replace void* with a LocalBuffer struct {ptr, size}
+ void start_allreduce_impl(GradID const&, const void*);
+ void finish_allreduce_impl(GradID const&, void*);
+
+ private:
+ struct OperatorWithState
+ {
+ std::unique_ptr<collectives::Allreduce::Operator> allreduce;
+ std::unique_ptr<AtomicCondition> has_finished;
+ };
+
+ static collectives::Allreduce::Operator::ReductionOp const reduction_op = collectives::Allreduce::Operator::ReductionOp::AVERAGE;
+
+ GPI::ResourceManager& resource_manager;
+ GPI::SegmentID segment_id;
+ GPI::Group const& group;
+ collectives::queues queue_handler; // TODO replace with the ResourceManager
+
+ std::unordered_map<GradID, FusedID> fused_ids;
+ std::unordered_map<FusedID, collectives::FusedTensorInfo> fused_tensor_infos;
+ std::unordered_map<FusedID, OperatorWithState> operators;
+
+ std::unordered_map<FusedID, std::unique_ptr<std::atomic<std::size_t>>> ready_to_start_counters;
+ std::unordered_map<FusedID, std::unique_ptr<std::atomic<std::size_t>>> finished_counters;
+ std::unordered_map<FusedID, std::unique_ptr<std::atomic<bool>>> ready_to_copy_back;
+ std::unordered_map<FusedID, std::unique_ptr<std::atomic<std::size_t>>> ready_to_reset_counters;
+
+ AtomicCondition setup_has_finished;
+ std::atomic<bool> terminate_man_thread;
+ std::thread management_thread;
+ void management_thread_task();
+
+ void copy_data_to_segment(GradID const&, const void*);
+ void copy_data_from_segment(GradID const&, void*);
+
+ void create_fused_tensor_infos_and_ids(std::vector const&, std::size_t);
+ void create_fused_tensors_synchronization();
+
+ template <typename AllreduceAlgorithm>
+ constexpr float get_overhead_factor() const;
+
+ template <typename AllreduceAlgorithm>
+ void create_segment_resources(std::vector<collectives::TensorInfo> const& tensor_infos) const;
+
+ void create_fused_tensor_infos(std::vector<collectives::TensorInfo> const &tensor_infos);
+
+ template <typename AllreduceAlgorithm>
+ std::unique_ptr<collectives::Allreduce::Operator> create_allreduce_op(collectives::TensorInfo const&);
+
+ template <typename AllreduceAlgorithm>
+ void create_operators_with_state();
+ };
+
+ template <typename AllreduceAlgorithm>
+ constexpr float SynchCommunicator::get_overhead_factor() const
+ {
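+ // sizing heuristic: the double-buffered allreduce keeps two copies of the
+ // data plus communication scratch space in the segment, hence the 3.5x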
+ return 3.5;
+ }
+
+ template <typename AllreduceAlgorithm>
+ void SynchCommunicator::create_segment_resources(std::vector<collectives::TensorInfo> const& tensor_infos) const
+ {
+ auto const segment_size = distribution::get_segment_size(tensor_infos, get_overhead_factor<AllreduceAlgorithm>());
+ resource_manager.make_segment_resources(segment_id, group, segment_size);
+
+ // Barrier is required, to ensure all ranks have finished registering
+ // their segments to their communication partners
+ collectives::Barrier::GPIBarrier barrier(group);
+ barrier.blocking_barrier();
+ }
+
+ template <typename AllreduceAlgorithm>
+ std::unique_ptr<collectives::Allreduce::Operator> SynchCommunicator::create_allreduce_op(collectives::TensorInfo const& tensor_info)
+ {
+ auto const required_resources = AllreduceAlgorithm::get_required_resources(tensor_info, group);
+
+ collectives::Allreduce::Operator::ResourceList resources;
+ for (auto const& resource : required_resources)
+ {
+ resources.emplace_back(
+ resource_manager.get_buffer_of_size(segment_id, resource.buffer_size),
+ resource_manager.get_notification_range(segment_id, resource.num_notifications));
+ }
+
+ return std::make_unique<AllreduceAlgorithm>(tensor_info, reduction_op, resources, queue_handler, group);
+ }
+
+ template <typename AllreduceAlgorithm>
+ void SynchCommunicator::create_operators_with_state()
+ {
+ for(auto const& fused_info : fused_tensor_infos)
+ {
+ auto const tensor_id = fused_info.first;
+ auto const tensor_info = fused_info.second.to_tensor_info();
+ OperatorWithState op{create_allreduce_op<AllreduceAlgorithm>(tensor_info), std::make_unique<AtomicCondition>()};
+ operators.emplace(tensor_id, std::move(op));
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/TensorBroadcaster.cpp b/src/gpi_comm_lib/TensorBroadcaster.cpp
new file mode 100644
index 00000000..f28cc19b
--- /dev/null
+++ b/src/gpi_comm_lib/TensorBroadcaster.cpp
@@ -0,0 +1,84 @@
+#include "TensorBroadcaster.hpp"
+
+#include "distribution/utilities.hpp"
+#include "gpi/Context.hpp"
+#include "gpi/ResourceManager.hpp"
+#include "gpi/SegmentBuffer.hpp"
+
+#include <cstring>
+#include <stdexcept>
+
+namespace tarantella
+{
+ TensorBroadcaster::TensorBroadcaster(GPI::Context& context,
+ GPI::SegmentID segment_id,
+ GPI::Group const& group,
+ std::vector<collectives::TensorInfo> const& tensor_infos,
+ GPI::Rank root_rank)
+ : context(context),
+ group(group),
+ queue_handler(),
+ root(root_rank),
+ barrier(group)
+ {
+ if(!group.contains_rank(root_rank))
+ {
+ throw std::runtime_error("[TensorBroadcaster::constructor]:\
+ root_rank is not part of the broadcast group");
+ }
+
+ auto const overhead_factor = 1.0;
+ auto& resource_manager = context.get_resource_manager();
+ auto const segment_size = distribution::get_segment_size(tensor_infos, overhead_factor);
+
+ resource_manager.make_segment_resources(segment_id, group, segment_size);
+
+ // Barrier is required, to ensure all ranks have finished registering
+ // their segments to their communication partners
+ barrier.blocking_barrier();
+
+ for(auto const& info : tensor_infos)
+ {
+ auto const size_in_bytes = info.get_nelems() * getDataTypeSize(info.get_elem_type());
+ buffers.emplace_back(resource_manager.get_buffer_of_size(segment_id, size_in_bytes));
+ }
+
+ auto const notifications = resource_manager.get_notification_range(segment_id,
+ collectives::broadcast::getNumberOfNotifications(group.get_size()));
+ bcast_op = std::make_unique<collectives::broadcast>(root, segment_size, segment_id, buffers.front().get_offset(),
+ notifications.first, queue_handler);
+ }
+
+ void TensorBroadcaster::exec_broadcast(std::vector<void*> const& data_ptrs)
+ {
+ // copy data to segments
+ if (context.get_rank() == root)
+ {
+ for (std::size_t i = 0; i < data_ptrs.size(); ++i)
+ {
+ std::memcpy(buffers[i].get_ptr(), data_ptrs[i], buffers[i].get_size());
+ }
+ }
+
+ // start the operation
+ if (context.get_rank() == root)
+ {
+ bcast_op->signal();
+ }
+ // drive the broadcast state machine until it reports completion
+ while(bcast_op->operator()() != 0);
+
+ // copy results back to buffers
+ if (context.get_rank() != root)
+ {
+ for (std::size_t i = 0; i < data_ptrs.size(); ++i)
+ {
+ std::memcpy(data_ptrs[i], buffers[i].get_ptr(), buffers[i].get_size());
+ }
+ }
+
+ // finalize operation
+ barrier.blocking_barrier();
+ }
+}
+
diff --git a/src/gpi_comm_lib/TensorBroadcaster.hpp b/src/gpi_comm_lib/TensorBroadcaster.hpp
new file mode 100644
index 00000000..c72440ce
--- /dev/null
+++ b/src/gpi_comm_lib/TensorBroadcaster.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include "collectives/barrier/GPIBarrier.hpp"
+#include "collectives/TensorInfo.hpp"
+#include "gpi/Context.hpp"
+#include "gpi/Group.hpp"
+#include "gpi/SegmentBuffer.hpp"
+#include "broadcast.h"
+
+#include <memory>
+#include <vector>
+
+namespace tarantella
+{
+
+ class TensorBroadcaster
+ {
+ public:
+ TensorBroadcaster(GPI::Context&, GPI::SegmentID, GPI::Group const&,
+ std::vector<collectives::TensorInfo> const&, GPI::Rank root_rank);
+ void exec_broadcast(std::vector<void*> const&);
+
+ private:
+ GPI::Context& context;
+ GPI::Group const group;
+ collectives::queues queue_handler; // FIXME: use GPI::ResourcesManager
+ GPI::Rank root;
+ collectives::Barrier::GPIBarrier barrier;
+
+ std::vector<GPI::SegmentBuffer> buffers;
+ std::unique_ptr<collectives::broadcast> bcast_op;
+ };
+}
diff --git a/src/gpi_comm_lib/collectives/BufferElementType.cpp b/src/gpi_comm_lib/collectives/BufferElementType.cpp
new file mode 100644
index 00000000..ac0b457e
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/BufferElementType.cpp
@@ -0,0 +1,26 @@
+#include "BufferElementType.hpp"
+
+#include <unordered_map>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ std::size_t getDataTypeSize(const BufferElementType d)
+ {
+ std::unordered_map<BufferElementType, std::size_t> const sizes
+ {
+ {BufferElementType::FLOAT, sizeof(float)},
+ {BufferElementType::DOUBLE, sizeof(double)},
+ {BufferElementType::INT16, sizeof(int16_t)},
+ {BufferElementType::INT32, sizeof(int32_t)}
+ };
+ return sizes.at(d);
+ }
+
+ std::ostream &operator<<(std::ostream& os, BufferElementType const& elem_type)
+ {
+ return os << static_cast<int>(elem_type);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/BufferElementType.hpp b/src/gpi_comm_lib/collectives/BufferElementType.hpp
new file mode 100644
index 00000000..846dda2c
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/BufferElementType.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cstddef>
+#include <ostream>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ enum class BufferElementType
+ {
+ FLOAT,
+ DOUBLE,
+ INT16,
+ INT32
+ };
+
+ std::size_t getDataTypeSize(const BufferElementType d);
+ std::ostream &operator<<(std::ostream& os, BufferElementType const& elem_type);
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/CMakeLists.txt b/src/gpi_comm_lib/collectives/CMakeLists.txt
new file mode 100644
index 00000000..9ed5643d
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/CMakeLists.txt
@@ -0,0 +1,41 @@
+
+set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+
+set(COLLECTIVES_SRC_DIR ${SRC_DIR}/gpi_comm_lib/collectives)
+set(libSources
+ ${COLLECTIVES_SRC_DIR}/lib/allreduceButterfly.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/allreduceButterflyDoubleBuffer.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/broadcast.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/counter.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/mailBoxGaspi.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/mailBoxLocal.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/queues.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/reduce.cpp
+ ${COLLECTIVES_SRC_DIR}/lib/writer.cpp
+ ${COLLECTIVES_SRC_DIR}/allreduce/RecursiveHalving.cpp
+ ${COLLECTIVES_SRC_DIR}/allreduce/RecursiveHalvingDoubleBuffer.cpp
+ ${COLLECTIVES_SRC_DIR}/allreduce/utils.cpp
+ ${COLLECTIVES_SRC_DIR}/barrier/GPIBarrier.cpp
+ ${COLLECTIVES_SRC_DIR}/BufferElementType.cpp
+ ${COLLECTIVES_SRC_DIR}/FusedTensorInfo.cpp
+ ${COLLECTIVES_SRC_DIR}/TensorInfo.cpp
+)
+
+extended_add_library(NAME collectives
+ NAMESPACE tnt
+ TYPE SHARED
+ SOURCES
+ ${libSources}
+ LIBRARIES
+ optimized GPI2::GPI2
+ debug GPI2::GPI2dbg
+ tnt::gpiresources
+ INCLUDE_DIRECTORIES
+ ${COLLECTIVES_SRC_DIR}/lib/
+ COMPILE_OPTIONS
+ -Wno-unused-private-field
+ INSTALL
+ INSTALL_DESTINATION
+ ${INSTALL_LIB_DIR}
+ POSITION_INDEPENDENT)
+
diff --git a/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp b/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp
new file mode 100644
index 00000000..f0ceed6b
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/FusedTensorInfo.cpp
@@ -0,0 +1,185 @@
+#include "FusedTensorInfo.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ void FusedTensorInfo::initialise_from_tensor_info(TensorInfo const& tensor_info)
+ {
+ local_offset_bytes.clear();
+ local_size_bytes.clear();
+
+ id = tensor_info.get_id();
+ nelems = tensor_info.get_nelems();
+ elem_type = tensor_info.get_elem_type();
+ elem_size = getDataTypeSize(elem_type);
+ size_bytes = nelems * elem_size;
+ num_tensors = 1UL;
+ tensor_ids.push_back(id);
+ local_offset_bytes[id] = 0UL;
+ local_size_bytes[id] = size_bytes;
+ }
+
+ FusedTensorInfo::FusedTensorInfo()
+ : id(),
+ nelems(),
+ elem_type(),
+ elem_size(),
+ size_bytes(),
+ num_tensors(),
+ tensor_ids(),
+ local_offset_bytes(),
+ local_size_bytes()
+ { }
+
+ FusedTensorInfo::FusedTensorInfo(TensorInfo const& tensor_info)
+ : FusedTensorInfo()
+ {
+ initialise_from_tensor_info(tensor_info);
+ }
+
+ FusedTensorInfo& FusedTensorInfo::operator=(TensorInfo const& tensor_info)
+ {
+ initialise_from_tensor_info(tensor_info);
+ return *this;
+ }
+
+ bool FusedTensorInfo::operator==(FusedTensorInfo const& other) const
+ {
+ return ( this->id == other.id &&
+ this->nelems == other.nelems &&
+ this->elem_type == other.elem_type &&
+ this->num_tensors == other.num_tensors &&
+ this->local_offset_bytes == other.local_offset_bytes &&
+ this->local_size_bytes == other.local_size_bytes );
+
+ }
+
+ FusedID FusedTensorInfo::get_id() const
+ {
+ return id;
+ }
+
+ std::size_t FusedTensorInfo::get_nelems() const
+ {
+ return nelems;
+ }
+
+ BufferElementType FusedTensorInfo::get_elem_type() const
+ {
+ return elem_type;
+ }
+
+ std::size_t FusedTensorInfo::get_size_bytes() const
+ {
+ return size_bytes;
+ }
+
+ std::size_t FusedTensorInfo::get_num_tensors() const
+ {
+ return num_tensors;
+ }
+
+ std::vector<GradID> FusedTensorInfo::get_tensor_ids() const
+ {
+ return tensor_ids;
+ }
+
+ std::size_t FusedTensorInfo::get_local_offset_bytes(GradID const& grad_id) const
+ {
+ auto const it = local_offset_bytes.find(grad_id);
+ if (it == local_offset_bytes.end())
+ {
+ throw std::logic_error("FusedTensorInfo::get_local_offset_bytes: FusedTensorInfo does not contain GradID");
+ }
+ return it->second;
+ }
+
+ std::size_t FusedTensorInfo::get_local_size_bytes(GradID const& grad_id) const
+ {
+ auto const it = local_size_bytes.find(grad_id);
+ if (it == local_size_bytes.end())
+ {
+ throw std::logic_error("FusedTensorInfo::get_local_size_bytes: FusedTensorInfo does not contain GradID");
+ }
+ return it->second;
+ }
+
+ void FusedTensorInfo::add_tensor_info(TensorInfo const& tensor_info)
+ {
+ if (tensor_info.get_elem_type() != get_elem_type())
+ {
+ throw std::logic_error("FusedTensorInfo::add_tensor_info: Tensors need to have same data type");
+ }
+
+ auto const grad_id = tensor_info.get_id();
+ auto const grad_nelems = tensor_info.get_nelems();
+ auto const grad_size_bytes = grad_nelems * elem_size;
+ auto const current_offset = size_bytes;
+
+ nelems += grad_nelems;
+ size_bytes += grad_size_bytes;
+ num_tensors += 1UL;
+
+ tensor_ids.push_back(grad_id);
+ local_offset_bytes[grad_id] = current_offset;
+ local_size_bytes[grad_id] = grad_size_bytes;
+ }
+
+ TensorInfo FusedTensorInfo::to_tensor_info() const
+ {
+ return {get_id(), get_nelems(), get_elem_type()};
+ }
+
+ TensorFusor::TensorFusor()
+ : threshold_bytes(0UL)
+ { }
+
+ TensorFusor::TensorFusor(std::size_t threshold)
+ : threshold_bytes(threshold)
+ { }
+
+ void TensorFusor::fuse_tensor_infos_and_ids(std::vector<TensorInfo> const& tensor_infos,
+ IDMap& fused_ids,
+ InfoMap& fused_tensor_infos)
+ {
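+ // Greedy fusion: pack consecutive tensors into the current fused tensor
+ // until it reaches `threshold_bytes`, then start a new one; a single
+ // tensor is simply mapped to itself.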
+ if (tensor_infos.size() == 1)
+ {
+ auto const tensor_info = tensor_infos.front();
+ auto const id = tensor_info.get_id();
+ fused_ids[id] = id;
+ fused_tensor_infos[id] = tensor_info;
+ return;
+ }
+
+ collectives::FusedTensorInfo fused_info(tensor_infos.front());
+ auto tensor_id = tensor_infos.front().get_id();
+ FusedID fused_id(tensor_id);
+ fused_ids[tensor_id] = fused_id;
+
+ for (auto idx = 1UL; idx < tensor_infos.size(); ++idx)
+ {
+ tensor_id = tensor_infos[idx].get_id();
+
+ if (fused_info.get_size_bytes() < threshold_bytes)
+ {
+ fused_info.add_tensor_info(tensor_infos[idx]);
+ }
+ else
+ {
+ fused_tensor_infos[fused_id] = fused_info;
+ fused_id = tensor_id;
+ fused_info = tensor_infos[idx];
+ }
+
+ fused_ids[tensor_id] = fused_id;
+
+ // Always add the last fused_tensor to the vector.
+ // Note, that it might still be smaller than `threshold_bytes`.
+ if (idx == tensor_infos.size() - 1)
+ {
+ fused_tensor_infos[fused_id] = fused_info;
+ }
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp b/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp
new file mode 100644
index 00000000..92925d2a
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/FusedTensorInfo.hpp
@@ -0,0 +1,69 @@
+#pragma once
+
+#include "BufferElementType.hpp"
+#include "TensorInfo.hpp"
+#include "Types.hpp"
+
+#include <cstddef>
+#include <unordered_map>
+#include <vector>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class FusedTensorInfo
+ {
+ public:
+ FusedTensorInfo();
+ FusedTensorInfo(TensorInfo const&);
+ FusedTensorInfo& operator=(TensorInfo const&);
+ bool operator==(FusedTensorInfo const&) const;
+
+ FusedID get_id() const;
+ std::size_t get_nelems() const;
+ BufferElementType get_elem_type() const;
+ std::size_t get_size_bytes() const;
+
+ std::size_t get_num_tensors() const;
+ std::vector<GradID> get_tensor_ids() const;
+
+ std::size_t get_local_offset_bytes(GradID const&) const;
+ std::size_t get_local_size_bytes(GradID const&) const;
+
+ void add_tensor_info(TensorInfo const&);
+ TensorInfo to_tensor_info() const;
+
+ private:
+ FusedID id;
+ std::size_t nelems;
+ BufferElementType elem_type;
+ std::size_t elem_size;
+ std::size_t size_bytes;
+ std::size_t num_tensors;
+
+ std::vector<GradID> tensor_ids;
+ std::unordered_map<GradID, std::size_t> local_offset_bytes;
+ std::unordered_map<GradID, std::size_t> local_size_bytes;
+
+ void initialise_from_tensor_info(TensorInfo const&);
+ };
+
+ class TensorFusor
+ {
+ public:
+ using IDMap = std::unordered_map<GradID, FusedID>;
+ using InfoMap = std::unordered_map<FusedID, FusedTensorInfo>;
+
+ TensorFusor();
+ TensorFusor(std::size_t threshold);
+
+ void fuse_tensor_infos_and_ids(std::vector<TensorInfo> const&,
+ IDMap&,
+ InfoMap&);
+
+ private:
+ std::size_t threshold_bytes;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/TensorInfo.cpp b/src/gpi_comm_lib/collectives/TensorInfo.cpp
new file mode 100644
index 00000000..b8b9d345
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/TensorInfo.cpp
@@ -0,0 +1,26 @@
+#include "TensorInfo.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ TensorInfo::TensorInfo(GradID tensid, std::size_t nelems, BufferElementType elem_type)
+ : id(tensid), nelems(nelems), elem_type(elem_type)
+ {}
+
+ GradID TensorInfo::get_id() const
+ {
+ return id;
+ }
+
+ std::size_t TensorInfo::get_nelems() const
+ {
+ return nelems;
+ }
+
+ BufferElementType TensorInfo::get_elem_type() const
+ {
+ return elem_type;
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/TensorInfo.hpp b/src/gpi_comm_lib/collectives/TensorInfo.hpp
new file mode 100644
index 00000000..374ce0dc
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/TensorInfo.hpp
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "BufferElementType.hpp"
+#include "Types.hpp"
+
+#include <cstddef>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class TensorInfo
+ {
+ public:
+ TensorInfo(GradID tensid, std::size_t nelems, BufferElementType elem_type);
+
+ GradID get_id() const;
+ std::size_t get_nelems() const;
+ BufferElementType get_elem_type() const;
+
+ private:
+ const GradID id;
+ const std::size_t nelems;
+ const BufferElementType elem_type;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/Types.hpp b/src/gpi_comm_lib/collectives/Types.hpp
new file mode 100644
index 00000000..2efe0caa
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/Types.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <cstddef>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ using GradID = std::size_t;
+ using FusedID = std::size_t;
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/Operator.hpp b/src/gpi_comm_lib/collectives/allreduce/Operator.hpp
new file mode 100644
index 00000000..e268721c
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/Operator.hpp
@@ -0,0 +1,71 @@
+#pragma once
+
+#include "collectives/BufferElementType.hpp"
+#include "gpi/NotificationManager.hpp"
+#include "gpi/SegmentBuffer.hpp"
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ // \note
+ // Interface for non-blocking, asynchronous Allreduce algorithms (not thread-safe)
+ class Operator
+ {
+ public:
+ class RequiredResource
+ {
+ public:
+ std::size_t buffer_size;
+ std::size_t num_notifications;
+ };
+ using RequiredResourceList = std::vector;
+ using Resource = std::pair<GPI::SegmentBuffer, GPI::NotificationManager::NotificationRange>;
+ using ResourceList = std::vector;
+
+ enum class ReductionOp
+ {
+ SUM,
+ AVERAGE
+ };
+
+ enum class OperatorState
+ {
+ NOT_STARTED,
+ RUNNING,
+ FINISHED
+ };
+
+ virtual ~Operator() = default;
+
+ // Initiates the Allreduce operation (non-blocking)
+ // and sets is_running == TRUE
+ virtual void start() = 0;
+
+ // Makes partial progress towards computing the Allreduce result
+ // and has to be called multiple times until the operation is completed,
+ // when is_finished == TRUE
+ // can be called independently of the state;
+ // it only tries to make progress if is_running == TRUE
+ virtual void trigger_communication_step() = 0;
+
+ // Enables the Allreduce to be started again
+ // and sets is_running == FALSE and is_finished == FALSE
+ virtual void reset_for_reuse() = 0;
+ virtual bool is_running() const = 0;
+
+ // If TRUE, results are available until reset_for_reuse() is called
+ virtual bool is_finished() const = 0;
+
+ // TODO: void* -> SegmentBuffer
+ virtual void* get_input_ptr() const = 0;
+ virtual void* get_result_ptr() const = 0;
+ };
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp
new file mode 100644
index 00000000..be62d494
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.cpp
@@ -0,0 +1,98 @@
+#include "RecursiveHalving.hpp"
+
+#include "utils.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ RecursiveHalving::RecursiveHalving(TensorInfo tensor_info,
+ ReductionOp reduction_op,
+ ResourceList const &resource_list,
+ queues &queues,
+ GPI::Group const &group)
+ : group(group),
+ state(OperatorState::NOT_STARTED),
+ allreduce(tensor_info.get_nelems(), to_allreduce_dataType(tensor_info.get_elem_type()),
+ to_allreduce_reductionType(reduction_op),
+ to_allreduce_segment_buffer(resource_list.at(0)),
+ to_allreduce_segment_buffer(resource_list.at(1)),
+ queues, group),
+ barrier(group)
+ {}
+
+ void RecursiveHalving::start()
+ {
+ if (is_running())
+ {
+ throw std::logic_error("[RecursiveHalving::start] Operation already started.");
+ }
+ if (is_finished())
+ {
+ throw std::logic_error("[RecursiveHalving::start] Operation not reset after finish.");
+ }
+ allreduce.signal();
+ state = OperatorState::RUNNING;
+ }
+
+ void RecursiveHalving::trigger_communication_step()
+ {
+ if (is_running())
+ {
+ auto const result = allreduce();
+ if (result == 0)
+ {
+ barrier.blocking_barrier();
+ state = OperatorState::FINISHED;
+ }
+ }
+ else
+ {
+ // do nothing before start() is called
+ }
+ }
+
+ void RecursiveHalving::reset_for_reuse()
+ {
+ if (is_running())
+ {
+ throw std::logic_error("[RecursiveHalving::reset] Cannot reset while running.");
+ }
+ state = OperatorState::NOT_STARTED;
+ }
+
+ bool RecursiveHalving::is_running() const
+ {
+ return state == OperatorState::RUNNING;
+ }
+
+ bool RecursiveHalving::is_finished() const
+ {
+ return state == OperatorState::FINISHED;
+ }
+
+ Operator::RequiredResourceList RecursiveHalving::get_required_resources(
+ TensorInfo const& tensor_info, GPI::Group const& group)
+ {
+ auto const num_notifications = allreduceButterfly::getNumberOfNotifications(group.get_size());
+ auto const num_elements_data_segment = tensor_info.get_nelems();
+ auto const num_elements_temp_segment = static_cast<std::size_t>(
+ allreduceButterfly::getNumberOfElementsSegmentCommunicate(tensor_info.get_nelems(), group.get_size()));
+ return {{num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications},
+ {num_elements_temp_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}};
+ }
+
+ void* RecursiveHalving::get_input_ptr() const
+ {
+ return allreduce.getReducePointer();
+ }
+
+ void* RecursiveHalving::get_result_ptr() const
+ {
+ return allreduce.getReducePointer();
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp
new file mode 100644
index 00000000..1687cf20
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalving.hpp
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "Operator.hpp"
+#include "allreduceButterfly.h"
+#include "collectives/barrier/GPIBarrier.hpp"
+#include "collectives/TensorInfo.hpp"
+#include "gpi/Group.hpp"
+#include "gpi/NotificationManager.hpp"
+#include "gpi/SegmentBuffer.hpp"
+
+#include <atomic>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ class RecursiveHalving : public Operator
+ {
+ public:
+ RecursiveHalving(TensorInfo,
+ ReductionOp,
+ ResourceList const&,
+ queues&,
+ GPI::Group const&);
+ RecursiveHalving(const RecursiveHalving&) = delete;
+ RecursiveHalving& operator=(const RecursiveHalving&) = delete;
+ ~RecursiveHalving() = default;
+
+ void start() override;
+ void trigger_communication_step() override;
+
+ void reset_for_reuse() override;
+ bool is_running() const override;
+ bool is_finished() const override;
+
+ void* get_input_ptr() const override;
+ void* get_result_ptr() const override;
+
+ static RequiredResourceList get_required_resources(TensorInfo const&, GPI::Group const&);
+
+ private:
+ GPI::Group const& group;
+ std::atomic<OperatorState> state;
+ allreduceButterfly allreduce;
+ Barrier::GPIBarrier barrier;
+ };
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp
new file mode 100644
index 00000000..0a2f842a
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.cpp
@@ -0,0 +1,98 @@
+#include "RecursiveHalvingDoubleBuffer.hpp"
+
+#include "gpi/gaspiCheckReturn.hpp"
+#include "utils.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ RecursiveHalvingDoubleBuffer::RecursiveHalvingDoubleBuffer(TensorInfo tensor_info,
+ ReductionOp reduction_op,
+ ResourceList const& resource_list,
+ queues& queues,
+ GPI::Group const& group)
+ : state(OperatorState::NOT_STARTED),
+ allreduce(tensor_info.get_nelems(),
+ to_allreduce_dataType(tensor_info.get_elem_type()),
+ to_allreduce_reductionType(reduction_op),
+ to_allreduce_segment_buffer(resource_list.at(0)),
+ to_allreduce_segment_buffer(resource_list.at(1)),
+ to_allreduce_segment_buffer(resource_list.at(2)),
+ queues, group)
+ { }
+
+ void RecursiveHalvingDoubleBuffer::start()
+ {
+ if (is_running())
+ {
+ throw std::logic_error("[RecursiveHalvingDoubleBuffer::start] Operation already started.");
+ }
+ if (is_finished())
+ {
+ throw std::logic_error("[RecursiveHalvingDoubleBuffer::start] Operation not reset after finish.");
+ }
+ allreduce.signal();
+ state = OperatorState::RUNNING;
+ }
+
+ void RecursiveHalvingDoubleBuffer::trigger_communication_step()
+ {
+ if (is_running())
+ {
+ auto const result = allreduce();
+ if (result == 0)
+ {
+ state = OperatorState::FINISHED;
+ }
+ }
+ }
+
+ void RecursiveHalvingDoubleBuffer::reset_for_reuse()
+ {
+ if (is_running())
+ {
+ throw std::logic_error("[RecursiveHalvingDoubleBuffer::reset] Cannot reset while running.");
+ }
+ state = OperatorState::NOT_STARTED;
+ }
+
+ bool RecursiveHalvingDoubleBuffer::is_running() const
+ {
+ return state == OperatorState::RUNNING;
+ }
+
+ bool RecursiveHalvingDoubleBuffer::is_finished() const
+ {
+ return state == OperatorState::FINISHED;
+ }
+
+ Operator::RequiredResourceList RecursiveHalvingDoubleBuffer::get_required_resources(
+ TensorInfo const& tensor_info, GPI::Group const& group)
+ {
+ auto const num_notifications = allreduceButterflyDoubleBuffer::getNumberOfNotifications(group.get_size());
+
+ auto const num_elements_data_segment = tensor_info.get_nelems();
+ auto const num_elements_temp_segment = static_cast<std::size_t>(
+ allreduceButterflyDoubleBuffer::getNumberOfElementsSegmentCommunicate(
+ tensor_info.get_nelems(), group.get_size()));
+
+ return {{num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications},
+ {num_elements_data_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications},
+ {num_elements_temp_segment * getDataTypeSize(tensor_info.get_elem_type()), num_notifications}};
+ }
+
+ void* RecursiveHalvingDoubleBuffer::get_input_ptr() const
+ {
+ return allreduce.getActiveReducePointer();
+ }
+
+ void* RecursiveHalvingDoubleBuffer::get_result_ptr() const
+ {
+ return allreduce.getResultsPointer();
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp
new file mode 100644
index 00000000..653a891f
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/RecursiveHalvingDoubleBuffer.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "Operator.hpp"
+#include "allreduceButterflyDoubleBuffer.h"
+#include "collectives/TensorInfo.hpp"
+#include "gpi/Group.hpp"
+
+#include <atomic>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ class RecursiveHalvingDoubleBuffer : public Operator
+ {
+ public:
+ RecursiveHalvingDoubleBuffer(TensorInfo,
+ ReductionOp,
+ ResourceList const&,
+ queues&,
+ GPI::Group const&);
+ RecursiveHalvingDoubleBuffer(const RecursiveHalvingDoubleBuffer&) = delete;
+ RecursiveHalvingDoubleBuffer& operator=(const RecursiveHalvingDoubleBuffer&) = delete;
+ ~RecursiveHalvingDoubleBuffer() = default;
+
+ void start() override;
+ void trigger_communication_step() override;
+
+ void reset_for_reuse() override;
+ bool is_running() const override;
+ bool is_finished() const override;
+
+ void* get_input_ptr() const override;
+ void* get_result_ptr() const override;
+
+ static RequiredResourceList get_required_resources(TensorInfo const&, GPI::Group const& group);
+
+ private:
+ std::atomic<OperatorState> state;
+ allreduceButterflyDoubleBuffer allreduce;
+ };
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/utils.cpp b/src/gpi_comm_lib/collectives/allreduce/utils.cpp
new file mode 100644
index 00000000..da5f5b3b
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/utils.cpp
@@ -0,0 +1,39 @@
+#include "utils.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ allreduce::dataType to_allreduce_dataType(const BufferElementType type)
+ {
+ std::unordered_map<BufferElementType, allreduce::dataType> const types{
+ {BufferElementType::FLOAT, allreduce::FLOAT},
+ {BufferElementType::DOUBLE, allreduce::DOUBLE},
+ {BufferElementType::INT16, allreduce::INT16},
+ {BufferElementType::INT32, allreduce::INT32},
+ };
+ return types.at(type);
+ }
+
+ allreduce::reductionType to_allreduce_reductionType(const Operator::ReductionOp op)
+ {
+ std::unordered_map<Operator::ReductionOp, allreduce::reductionType> const reduction_ops{
+ {Operator::ReductionOp::SUM, allreduce::SUM},
+ {Operator::ReductionOp::AVERAGE, allreduce::AVERAGE},
+ };
+ return reduction_ops.at(op);
+ }
+
+ allreduceButterfly::segmentBuffer to_allreduce_segment_buffer(Operator::Resource const& resource)
+ {
+ auto const [data_segment_buffer, notif_range] = resource;
+ allreduceButterfly::segmentBuffer buffer{data_segment_buffer.get_segment_id(),
+ data_segment_buffer.get_offset(),
+ static_cast<gaspi_notification_id_t>(notif_range.first)};
+ return buffer;
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/allreduce/utils.hpp b/src/gpi_comm_lib/collectives/allreduce/utils.hpp
new file mode 100644
index 00000000..462faa51
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/allreduce/utils.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "allreduce.h"
+#include "allreduceButterfly.h"
+#include "collectives/BufferElementType.hpp"
+#include "Operator.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Allreduce
+ {
+ allreduce::dataType to_allreduce_dataType(const BufferElementType type);
+ allreduce::reductionType to_allreduce_reductionType(
+ const Operator::ReductionOp op);
+ allreduceButterfly::segmentBuffer to_allreduce_segment_buffer(
+ Operator::Resource const &resource);
+ }
+ }
+}
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp
new file mode 100644
index 00000000..c954a6a9
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.cpp
@@ -0,0 +1,38 @@
+#include "GPIBarrier.hpp"
+#include "gpi/gaspiCheckReturn.hpp"
+
+#include <stdexcept>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Barrier
+ {
+ GPIBarrier::GPIBarrier(GPI::Group const &group)
+ {
+ gaspi_rank_t comm_size;
+ GPI::gaspiCheckReturn(gaspi_proc_num(&comm_size),
+ "GPIBarrier::GPIBarrier : get number of ranks");
+ if (group.get_size() != comm_size)
+ {
+ throw std::invalid_argument("GPIBarrier::GPIBarrier : can only be used with all ranks in \
+ the default GPI communicator");
+ }
+ }
+
+ // TODO: implement for any GPI::Group
+ void GPIBarrier::blocking_barrier()
+ {
+ GPI::gaspiCheckReturn(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK),
+ "GPIBarrier::GPIBarrier : barrier failed");
+ }
+
+ void GPIBarrierAllRanks::blocking_barrier()
+ {
+ GPI::gaspiCheckReturn(gaspi_barrier(GASPI_GROUP_ALL, GASPI_BLOCK),
+ "GPIBarrierAllRanks::GPIBarrierAllRanks : barrier failed");
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp
new file mode 100644
index 00000000..3ac367a1
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/barrier/GPIBarrier.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include "gpi/Group.hpp"
+#include "Operator.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Barrier
+ {
+ // GPI Barrier implementation for GROUP_COMM_ALL
+ class GPIBarrier : public Operator
+ {
+ public:
+
+ GPIBarrier(GPI::Group const & group);
+ void blocking_barrier();
+ };
+
+ class GPIBarrierAllRanks : public Operator
+ {
+ public:
+
+ GPIBarrierAllRanks() = default;
+ void blocking_barrier();
+ };
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/barrier/Operator.hpp b/src/gpi_comm_lib/collectives/barrier/Operator.hpp
new file mode 100644
index 00000000..81e863bb
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/barrier/Operator.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace Barrier
+ {
+ // \note
+ // Interface for Barrier algorithms (not thread-safe)
+ class Operator
+ {
+ public:
+ virtual ~Operator() = default;
+
+ virtual void blocking_barrier() = 0;
+ };
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/allreduce.h b/src/gpi_comm_lib/collectives/lib/allreduce.h
new file mode 100755
index 00000000..b9f12fe5
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/allreduce.h
@@ -0,0 +1,27 @@
+#pragma once
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class allreduce {
+ public:
+ enum reductionType {
+ SUM = 0,
+ AVERAGE = 1,
+ NUM_RED = 2
+ };
+ enum dataType {
+ FLOAT = 0,
+ DOUBLE = 1,
+ INT16 = 2,
+ INT32 = 3,
+ NUM_TYPE = 4
+ };
+
+ virtual int operator()() = 0;
+ virtual void signal() = 0;
+ virtual ~allreduce() {}
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp
new file mode 100755
index 00000000..26fb2b58
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.cpp
@@ -0,0 +1,418 @@
+#include "allreduceButterfly.h"
+#include "gpi/gaspiCheckReturn.hpp"
+#include "mailBoxGaspi.h"
+#include "gpi/Group.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <stdexcept>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ using tarantella::GPI::gaspiCheckReturn;
+
+ nestedRingParameter::nestedRingParameter(const rankIndexType numRanks_,
+ const rankIndexType rank_) :
+ numRanks(numRanks_),
+ rank(rank_),
+ ringSizes(getRingSizes(numRanks)),
+ strides(getStrides(ringSizes)),
+ ringIndices(getRingIndices(ringSizes, rank_)) {}
+
+ inline nestedRingParameter::ringSizesType nestedRingParameter::getRingSizes(
+ rankIndexType numRanks) {
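+ // decompose the number of ranks into small (prime) factors; each factor
+ // becomes the length of one ring of the nested-ring topology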
+ ringSizesType s;
+
+ unsigned long limit = std::sqrt(numRanks) + 2;
+
+ for (unsigned long factor=2; factor < limit; factor++) {
+ while ((numRanks % factor) == 0) {
+ s.push_back(factor);
+ numRanks /= factor;
+ }
+ }
+
+ if (numRanks > 1) {
+ s.push_back(numRanks);
+ }
+
+ return s;
+ }
+
+ inline nestedRingParameter::stridesType nestedRingParameter::getStrides(
+ const ringSizesType& ringSizes) {
+ const long numLevels = ringSizes.size();
+ stridesType s(ringSizes.size());
+ unsigned long factor = 1;
+ for (long level=numLevels - 1; level >= 0; level--) {
+ s[level] = factor;
+ factor *= ringSizes[level];
+ }
+
+ return s;
+ }
+
+ inline nestedRingParameter::ringIndicesType
+ nestedRingParameter::getRingIndices(const ringSizesType& ringSizes,
+ const rankIndexType rank) {
+ ringIndicesType indices;
+
+ rankIndexType product = 1;
+ for (unsigned long i=0; i < ringSizes.size(); i++) {
+ indices.push_back((rank / product) % ringSizes[i]);
+ product *= ringSizes[i];
+ }
+
+ return indices;
+ }
+
+ nestedRingParameter::rankIndexType
+ nestedRingParameter::getNumberOfRings() const{
+ return ringSizes.size();
+ }
+
+ nestedRingParameter::rankIndexType nestedRingParameter::getRingLength(
+ const levelType level) const {
+ return ringSizes[level];
+ }
+
+ nestedRingParameter::rankIndexType nestedRingParameter::getLocalRankInRing(
+ const levelType level) const {
+ return ringIndices[level];
+ }
+
+ nestedRingParameter::rankIndexType
+ nestedRingParameter::getGlobalRankToWriteInRing(
+ const levelType level) const {
+ long numLevels = ringSizes.size();
+ rankIndexType r = 0;
+ for (long i=numLevels - 1; i > long(level); i--) {
+ r = ringIndices[i] + ringSizes[i] * r;
+ }
+ const rankIndexType next = (ringIndices[level] + 1) % ringSizes[level];
+ r = next + ringSizes[level] * r;
+ for (long i=long(level) - 1; i >= 0; i--) {
+ r = ringIndices[i] + ringSizes[i] * r;
+ }
+ return r;
+ }
+
+ nestedRingParameter::bufferIndexType nestedRingParameter::getBufferLength(
+ const levelType level) const {
+ return strides[level];
+ }
+
+ nestedRingParameter::bufferIndexType nestedRingParameter::getBufferStart(
+ const levelType level,
+ const bufferIndexType buffer) const {
+ // we assume that each global rank aggregates on each level the buffer
+ // that matches the local ring id. This buffer is
+ // I.E. getBufferStart(level, getRankInRing(level))
+ // -> getBufferStart(level, getRankInRing(level)) + getBufferLength(level)
+
+ bufferIndexType s = 0;
+ for (unsigned long i=0; i < level; i++) {
+ s += ringIndices[i] * strides[i];
+ }
+ s += buffer * strides[level];
+
+ return s;
+ }
+
+ allreduceButterfly::allreduceButterfly(
+ const long len,
+ const dataType data,
+ const reductionType reduction,
+ const segmentBuffer locationReduce_,
+ const segmentBuffer locationCommunicate_,
+ queues& queues_,
+ GPI::Group const& group_
+ )
+ : totalLength(len),
+ dataElement(data),
+ group(group_),
+ numRanks(getNumRanks()),
+ rank(getRank()),
+ locationReduce(locationReduce_),
+ locationReducePointer(getSegmentPointer(locationReduce_.segment)
+ + locationReduce_.offset),
+ locationCommunicate(locationCommunicate_),
+ topology(numRanks, getRankIndex(rank, getRanks())),
+ sender(queues_),
+ reducer(getReduce(data, reduction)),
+ status(2 * getNumberOfNotifications(numRanks) + 1){
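+ // one status phase per reduce-scatter and per all-gather step on each
+ // ring, plus the initial local trigger: 2 * notifications + 1 in total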
+
+ std::vector<gaspi_rank_t> ranks = getRanks();
+
+ setReduceScatter(ranks);
+ setAllToAll(ranks);
+ }
+
+ long allreduceButterfly::getNumRanks() const {
+ return group.get_size();
+ }
+
+ long allreduceButterfly::getRank() {
+ gaspi_rank_t rank;
+ gaspiCheckReturn(gaspi_proc_rank(&rank),
+ "gaspi_proc_rank failed with ");
+ return rank;
+ }
+
+ std::vector<gaspi_rank_t> allreduceButterfly::getRanks() const {
+ return group.get_ranks();
+ }
+
+ unsigned long allreduceButterfly::getRankIndex(
+ gaspi_rank_t rank,
+ const std::vector<gaspi_rank_t>& ranks) {
+ unsigned long rankIndex;
+ if (find(ranks.begin(), ranks.end(), rank) == ranks.end()) {
+ throw std::runtime_error("rank not member of group");
+ } else {
+ rankIndex = find(ranks.begin(), ranks.end(), rank)
+ - ranks.begin();
+ }
+ return rankIndex;
+ }
+
+ void allreduceButterfly::setReduceScatter(
+ const std::vector<gaspi_rank_t>& ranks) {
+ gaspi_notification_id_t nextNotification
+ = locationCommunicate.firstNotification;
+ gaspi_offset_t nextLocalCommunicationBufferByte = 0;
+ const char* const reductionSourceBasePointer =
+ getSegmentPointer(locationCommunicate.segment)
+ + locationCommunicate.offset;
+ char* const reductionDestinationBasePointer =
+ getSegmentPointer(locationReduce.segment)
+ + locationReduce.offset;
+
+ receiver.push_back(&trigger);
+ jobs.push_back(jobType());
+
+ for (unsigned long ring=0; ring < topology.getNumberOfRings(); ring++) {
+
+ const rankIndexType ringLength = topology.getRingLength(ring);
+ const rankIndexType ringRank = topology.getLocalRankInRing(ring);
+ const bufferIndexType bufferLengthIndex = topology.getBufferLength(ring);
+ const gaspi_rank_t outgoingGlobalRank =
+ ranks[topology.getGlobalRankToWriteInRing(ring)];
+ gaspi_offset_t nextRemoteCommunicationBufferByte
+ = nextLocalCommunicationBufferByte;
+
+
+ for (unsigned long loop=0; loop < ringLength - 1; loop++) {
+ const unsigned long currentJob = receiver.size() - 1;
+ receiver.push_back(
+ new mailBoxGaspi(locationCommunicate.segment, nextNotification));
+ jobs.push_back(jobType());
+
+ const bufferIndexType sendBufferID =
+ (ringRank + ringLength - loop - 1) % ringLength;
+ const bufferIndexType sendStartIndex =
+ topology.getBufferStart(ring, sendBufferID);
+ const gaspi_offset_t sendStartByte =
+ chunkIndexToByte(sendStartIndex);
+ const long sendLengthByte =
+ chunkIndexToByte(sendStartIndex + bufferLengthIndex)
+ - sendStartByte;
+ const writer::transferParameters transfer(
+ true,
+ outgoingGlobalRank,
+ locationReduce.segment,
+ locationReduce.offset + sendStartByte,
+ locationCommunicate.segment,
+ locationCommunicate.offset + nextRemoteCommunicationBufferByte,
+ sendLengthByte,
+ nextNotification);
+ jobs[currentJob].second = transfer;
+
+ const bufferIndexType receiveBufferID =
+ (ringRank + ringLength - loop - 2) % ringLength;
+ const bufferIndexType receiveStartIndex =
+ topology.getBufferStart(ring, receiveBufferID);
+ const gaspi_offset_t receiveStartByte =
+ chunkIndexToByte(receiveStartIndex);
+ const long receiveLengthByte =
+ chunkIndexToByte(receiveStartIndex + bufferLengthIndex)
+ - receiveStartByte;
+ const reduce::task copy(
+ reductionSourceBasePointer + nextLocalCommunicationBufferByte,
+ reductionDestinationBasePointer + receiveStartByte,
+ receiveLengthByte / getDataTypeSize(dataElement));
+ jobs[currentJob + 1].first = copy;
+
+ nextNotification++;
+ nextRemoteCommunicationBufferByte += sendLengthByte;
+ nextLocalCommunicationBufferByte += receiveLengthByte;
+ }
+ }
+
+ jobs.back().first.scaling = numRanks;
+ }
+
+ inline char* allreduceButterfly::getSegmentPointer(
+ const gaspi_segment_id_t segment) {
+ gaspi_pointer_t p;
+ gaspiCheckReturn(gaspi_segment_ptr(segment, &p),
+ "failed getting segment pointer");
+ return (char*) p;
+ }
+
+ inline unsigned long allreduceButterfly::chunkIndexToByte(
+ const long chunkIndex) const {
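+ // data is split into ceil(totalLength / numRanks)-element chunks; convert
+ // a chunk index into its byte offset within the segment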
+ return ((totalLength * chunkIndex + numRanks - 1) / numRanks)
+ * getDataTypeSize(dataElement);
+ }
+
+ void allreduceButterfly::setAllToAll(
+ const std::vector<gaspi_rank_t>& ranks) {
+ gaspi_notification_id_t nextNotification = locationReduce.firstNotification;
+
+ for (long ring = topology.getNumberOfRings() - 1; ring >= 0; ring--) {
+
+ const rankIndexType ringLength = topology.getRingLength(ring);
+ const rankIndexType ringRank = topology.getLocalRankInRing(ring);
+ const bufferIndexType bufferLengthIndex = topology.getBufferLength(ring);
+ const gaspi_rank_t outgoingGlobalRank =
+ ranks[topology.getGlobalRankToWriteInRing(ring)];
+
+ for (unsigned long loop=0; loop < ringLength - 1; loop++) {
+ const unsigned long currentJob = receiver.size() - 1;
+ receiver.push_back(
+ new mailBoxGaspi(locationReduce.segment, nextNotification));
+ jobs.push_back(jobType());
+
+ const bufferIndexType transferBufferID =
+ (ringRank + ringLength - loop) % ringLength;
+ const bufferIndexType transferStartIndex =
+ topology.getBufferStart(ring, transferBufferID);
+ const gaspi_offset_t transferStartByte =
+ chunkIndexToByte(transferStartIndex);
+ const long transferLengthByte =
+ chunkIndexToByte(transferStartIndex + bufferLengthIndex)
+ - transferStartByte;
+
+ const writer::transferParameters transfer(
+ true,
+ outgoingGlobalRank,
+ locationReduce.segment,
+ locationReduce.offset + transferStartByte,
+ locationReduce.segment,
+ locationReduce.offset + transferStartByte,
+ transferLengthByte,
+ nextNotification);
+ jobs[currentJob].second = transfer;
+
+ nextNotification++;
+ }
+ }
+ }
+
+ allreduceButterfly::~allreduceButterfly() {
+ delete reducer;
+ for (unsigned long i=1; i < receiver.size(); i++) {
+ delete receiver[i];
+ }
+ }
+
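+ // Executes one phase per call: returns -1 while the phase's notification
+ // is still outstanding, otherwise runs the local reduction (if any) and
+ // the outgoing write, and returns 0 once the final phase has completed.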
+ int allreduceButterfly::operator()() {
+ const unsigned long phase = status.get();
+ // could be a problem if we overtake by one iteration?
+ if (!receiver[phase]->gotNotification()) {
+ return -1;
+ }
+
+ reducer->operator()(jobs[phase].first);
+ // release the buffer already at this point?
+ sender(jobs[phase].second);
+
+ return (status.increment() == 0) ? 0 : -1;
+ }
+
+ void allreduceButterfly::signal() {
+ trigger.notify();
+ }
+
+ gaspi_pointer_t allreduceButterfly::getReducePointer() const {
+ return locationReducePointer;
+ }
+
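+ // The communicate segment must hold numRanks - 1 incoming chunks of
+ // ceil(len / numRanks) elements each.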
+ long allreduceButterfly::getNumberOfElementsSegmentCommunicate(
+ const long len,
+ const long numRanks) {
+ return ((len + numRanks - 1) / numRanks) * (numRanks - 1);
+ }
+
+ unsigned long allreduceButterfly::getNumberOfNotifications(
+ const long numRanks) {
+ const nestedRingParameter topology(numRanks);
+
+ gaspi_notification_id_t notifications = 0;
+ for (unsigned long i=0; i < topology.getNumberOfRings(); i++) {
+ notifications += topology.getRingLength(i) - 1;
+ }
+
+ return notifications;
+ }
+
+ std::ostream& allreduceButterfly::report(std::ostream& s) const {
+ char* pr = getSegmentPointer(locationReduce.segment);
+ char* pc = getSegmentPointer(locationCommunicate.segment);
+ const unsigned long phase = status.get();
+ s << "total length: " << totalLength << std::endl
+ << "dataElement: " << dataElement << std::endl
+ << "numRanks: " << numRanks << std::endl
+ << "rank: " << rank << std::endl
+ << "topology.getNumberOfRings" << topology.getNumberOfRings() << std::endl
+ << "getNumberOfNotifications(): "
+ << getNumberOfNotifications(numRanks) << std::endl
+ << "segmentReduce: " << long(locationReduce.segment) << std::endl
+ << "offsetReduce: " << locationReduce.offset << std::endl
+ << "firstNotificationReduce: " << locationReduce.firstNotification
+ << std::endl
+ << "segmentCommunicate: " << long(locationCommunicate.segment)
+ << std::endl
+ << "offsetCommunicate: " << locationCommunicate.offset << std::endl
+ << "firstNotificationCommunicate: "
+ << locationCommunicate.firstNotification << std::endl
+ << "pointer segment reduce : "
+ << (void*)getSegmentPointer(locationReduce.segment) << std::endl
+ << "pointer segment communicate: "
+ << (void*)getSegmentPointer(locationCommunicate.segment) << std::endl
+ << "phase " << phase << std::endl;
+ for (unsigned long i=0; i < jobs.size(); i++) {
+ s << ".........................." << std::endl;
+ s << "phase " << i << std::endl;
+ if (i==0) {
+ s << "Receiver: " << "user" << std::endl;
+ } else {
+ mailBoxGaspi* m = (mailBoxGaspi*) receiver[i];
+ s << "Receiver: segment " << long(m->getSegmentID())
+ << " notification ID " << m->getMailID() << std::endl;
+ }
+
+ if (jobs[i].first.len > 0) {
+ s << "Reduce : src " << jobs[i].first.source
+ << " (" << (char*)jobs[i].first.source - pc << ")"
+ << " dst " << jobs[i].first.destination
+ << " (" << (char*)jobs[i].first.destination - pr << ")"
+ << " ele " << jobs[i].first.len
+ << " (" << jobs[i].first.len * getDataTypeSize(dataElement) << ")"
+ << std::endl;
+ } else {
+ s << "Reduce : idle" << std::endl;
+ }
+
+ s << "Send : ";
+ jobs[i].second.report(s) << std::endl;
+ }
+ s << ".........................." << std::endl;
+
+ return s;
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h
new file mode 100644
index 00000000..194bac0a
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/allreduceButterfly.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "allreduce.h"
+#include "counter.h"
+#include "gpi/Group.hpp"
+#include "mailBox.h"
+#include "mailBoxLocal.h"
+#include "queues.h"
+#include "reduce.h"
+#include "writer.h"
+
+#include <GASPI.h>
+
+#include <vector>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class nestedRingParameter {
+ public:
+ typedef unsigned long rankIndexType;
+ typedef unsigned long levelType;
+ typedef unsigned long bufferIndexType;
+
+ nestedRingParameter(const rankIndexType numRanks_,
+ const rankIndexType rank_=0);
+
+ rankIndexType getNumberOfRings() const;
+ rankIndexType getRingLength(const levelType level) const;
+ rankIndexType getLocalRankInRing(const levelType level) const;
+ rankIndexType getGlobalRankToWriteInRing(const levelType level) const;
+ bufferIndexType getBufferLength(const levelType level) const;
+ bufferIndexType getBufferStart(const levelType level,
+ const bufferIndexType buffer) const;
+
+ private:
+
+ typedef std::vector<rankIndexType> ringIndicesType;
+ typedef std::vector<rankIndexType> ringSizesType;
+ typedef std::vector<rankIndexType> stridesType;
+
+ static inline ringSizesType getRingSizes(rankIndexType numRanks);
+ static inline stridesType getStrides(const ringSizesType& ringSizes);
+ static inline ringIndicesType getRingIndices(const ringSizesType& ringSizes,
+ const rankIndexType rank);
+
+ const rankIndexType numRanks;
+ const rankIndexType rank;
+ const ringSizesType ringSizes;
+ const stridesType strides;
+ const ringIndicesType ringIndices;
+ };
+
+ class allreduceButterfly : public allreduce {
+ public:
+
+ struct segmentBuffer {
+ gaspi_segment_id_t segment;
+ gaspi_offset_t offset;
+ gaspi_notification_id_t firstNotification;
+ };
+
+ allreduceButterfly(const long len,
+ const dataType data,
+ const reductionType reduction,
+ const segmentBuffer segmentReduce,
+ const segmentBuffer segmentCommunicate,
+ queues& queues_,
+ GPI::Group const& group_);
+ ~allreduceButterfly();
+ int operator()();
+ void signal();
+
+ gaspi_pointer_t getReducePointer() const;
+ static long getNumberOfElementsSegmentCommunicate(const long len,
+ const long numRanks);
+ static unsigned long getNumberOfNotifications(const long numRanks);
+ std::ostream& report(std::ostream& s) const;
+
+ private:
+
+ typedef nestedRingParameter::rankIndexType rankIndexType;
+ typedef nestedRingParameter::bufferIndexType bufferIndexType;
+ typedef std::pair<reduce::task, writer::transferParameters> jobType;
+
+ inline long getNumRanks() const;
+ static inline long getRank();
+ std::vector<gaspi_rank_t> getRanks() const;
+ static inline rankIndexType getRankIndex(
+ gaspi_rank_t rank,
+ const std::vector<gaspi_rank_t>& ranks);
+ void setReduceScatter(const std::vector<gaspi_rank_t>& ranks);
+ inline static char* getSegmentPointer(const gaspi_segment_id_t segment);
+ inline unsigned long chunkIndexToByte(const long chunkIndex) const;
+ void setAllToAll(const std::vector<gaspi_rank_t>& ranks);
+
+ const long totalLength;
+ const dataType dataElement;
+ GPI::Group const group;
+ const long numRanks;
+ const gaspi_rank_t rank;
+ const segmentBuffer locationReduce;
+ const gaspi_pointer_t locationReducePointer;
+ const segmentBuffer locationCommunicate;
+
+ const nestedRingParameter topology;
+
+ mailBoxLocal trigger;
+ std::vector<mailBox*> receiver;
+ std::vector<jobType> jobs;
+
+ writer sender;
+ reduce * reducer;
+ counter status;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp
new file mode 100755
index 00000000..484b5268
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.cpp
@@ -0,0 +1,91 @@
+#include "allreduceButterflyDoubleBuffer.h"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ allreduceButterflyDoubleBuffer::allreduceButterflyDoubleBuffer(
+ const long len,
+ const dataType data,
+ const reductionType reduction,
+ const allreduceButterfly::segmentBuffer segmentReduce0,
+ const allreduceButterfly::segmentBuffer segmentReduce1,
+ const allreduceButterfly::segmentBuffer segmentCommunicate,
+ queues& queues,
+ GPI::Group const& group)
+ : state(0),
+ reduceFirst(len, data, reduction, segmentReduce0,
+ segmentCommunicate, queues, group),
+ reduceSecond(len, data, reduction, segmentReduce1,
+ segmentCommunicate, queues, group) {
+ tableReduce[0] = &reduceFirst;
+ tableReduce[1] = &reduceSecond;
+ }
+
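+ // Runs the currently active butterfly; once it completes, flips to the
+ // other buffer so the next allreduce can start while the finished result
+ // stays readable through getResultsPointer().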
+ int allreduceButterflyDoubleBuffer::operator()() {
+ const int result = getReduce()();
+
+ if (!result) {
+ flipReduce();
+ }
+
+ return result;
+ }
+
+ inline allreduceButterfly& allreduceButterflyDoubleBuffer::getReduce() const {
+ return *tableReduce[stateToIndex(state)];
+ }
+
+ inline long allreduceButterflyDoubleBuffer::stateToIndex(const long state) {
+ return state & 1l;
+ }
+
+ inline void allreduceButterflyDoubleBuffer::flipReduce() {
+ __sync_fetch_and_add(&state, 1l);
+ }
+
+ void allreduceButterflyDoubleBuffer::signal() {
+ getReduce().signal();
+ }
+
+ gaspi_pointer_t allreduceButterflyDoubleBuffer::getActiveReducePointer() const {
+ return getReduce().getReducePointer();
+ }
+
+ gaspi_pointer_t allreduceButterflyDoubleBuffer::getResultsPointer() const {
+ return getOtherReduce().getReducePointer();
+ }
+
+ inline const allreduceButterfly&
+ allreduceButterflyDoubleBuffer::getOtherReduce() const {
+ return *tableReduce[invertIndex(stateToIndex(state))];
+ }
+
+ inline long allreduceButterflyDoubleBuffer::invertIndex(const long state) {
+ return state ^ 1l;
+ }
+
+ long allreduceButterflyDoubleBuffer::getNumberOfElementsSegmentCommunicate(
+ const long len,
+ const long numRanks) {
+ return allreduceButterfly::getNumberOfElementsSegmentCommunicate(len,
+ numRanks);
+ }
+
+ unsigned long allreduceButterflyDoubleBuffer::getNumberOfNotifications(
+ const long numRanks) {
+ return allreduceButterfly::getNumberOfNotifications(numRanks);
+ }
+
+ std::ostream& allreduceButterflyDoubleBuffer::report(std::ostream& s) const {
+ s << "stateExecute: " << state << std::endl
+ << "***** reduceFirst *****" << std::endl;
+ reduceFirst.report(s);
+ s << "***** reduceSecond *****" << std::endl;
+ reduceSecond.report(s);
+
+ return s;
+ }
+ }
+}
+
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h
new file mode 100755
index 00000000..e4f503b7
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/allreduceButterflyDoubleBuffer.h
@@ -0,0 +1,52 @@
+#pragma once
+
+#include "allreduceButterfly.h"
+#include "gpi/Group.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class allreduceButterflyDoubleBuffer : public allreduce {
+ public:
+
+ allreduceButterflyDoubleBuffer(
+ const long len,
+ const dataType data,
+ const reductionType reduction,
+ const allreduceButterfly::segmentBuffer segmentReduce0,
+ const allreduceButterfly::segmentBuffer segmentReduce1,
+ const allreduceButterfly::segmentBuffer segmentCommunicate,
+ queues& queues,
+ GPI::Group const& group);
+ int operator()();
+ void signal();
+
+ gaspi_pointer_t getActiveReducePointer() const;
+ gaspi_pointer_t getResultsPointer() const;
+ static long getNumberOfElementsSegmentCommunicate(const long len,
+ const long numRanks);
+ static unsigned long getNumberOfNotifications(const long numRanks);
+ std::ostream& report(std::ostream& s) const;
+
+ private:
+
+ inline allreduceButterfly& getReduce() const;
+ static inline long stateToIndex(const long state);
+ inline void flipReduce();
+ inline const allreduceButterfly& getOtherReduce() const;
+ static inline long invertIndex(const long state);
+
+ static const long CACHE_LINE_SIZE = 64;
+
+ char pad0[CACHE_LINE_SIZE];
+ volatile long state;
+ char pad1[CACHE_LINE_SIZE];
+
+ allreduceButterfly reduceFirst;
+ allreduceButterfly reduceSecond;
+ allreduceButterfly* tableReduce[2];
+ };
+ }
+}
+
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/lib/broadcast.cpp b/src/gpi_comm_lib/collectives/lib/broadcast.cpp
new file mode 100755
index 00000000..c1b814e1
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/broadcast.cpp
@@ -0,0 +1,197 @@
+#include "broadcast.h"
+#include "gpi/gaspiCheckReturn.hpp"
+#include "mailBoxGaspi.h"
+
+#include <algorithm>
+#include <stdexcept>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ using tarantella::GPI::gaspiCheckReturn;
+
+ broadcast::broadcast(
+ const gaspi_rank_t master_,
+ const long len,
+ const gaspi_segment_id_t segment_,
+ const gaspi_offset_t offset_,
+ const gaspi_notification_id_t firstNotification_,
+ queues& queues_ )
+ : totalLength(len),
+ group(GASPI_GROUP_ALL),
+ numRanks(getNumRanks()),
+ rank(getRank()),
+ masterRank(master_),
+ segment(segment_),
+ offset(offset_),
+ firstNotification(firstNotification_),
+ sender(queues_),
+ status((rank == masterRank) ? 1 : numRanks){
+
+ std::vector<gaspi_rank_t> ranks(numRanks);
+ gaspiCheckReturn(gaspi_group_ranks(group, &ranks[0]),
+ "gaspi_group_ranks failed with");
+ const unsigned long rankIndex = getRankIndex(rank, ranks);
+
+ if (rank == masterRank) {
+ setMaster(rankIndex, ranks);
+ } else {
+ setWorker(rankIndex, ranks);
+ }
+ }
+
+ long broadcast::getNumRanks() const {
+ gaspi_number_t size;
+ gaspiCheckReturn(gaspi_group_size(group, &size),
+ "gaspi_group_size failed with ");
+ return size;
+ }
+
+ long broadcast::getRank() {
+ gaspi_rank_t rank;
+ gaspiCheckReturn(gaspi_proc_rank(&rank),
+ "gaspi_proc_rank failed with ");
+ return rank;
+ }
+
+ long broadcast::getRankIndex(gaspi_rank_t rank,
+ const std::vector<gaspi_rank_t>& ranks) {
+ const auto it = find(ranks.begin(), ranks.end(), rank);
+ if (it == ranks.end()) {
+ throw std::runtime_error("rank not member of group");
+ }
+ const unsigned long rankIndex = it - ranks.begin();
+ return rankIndex;
+ }
+
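+ // The broadcast is a ring pipeline: the master pushes numRanks chunks to
+ // its neighbour, each worker forwards every chunk it receives, and the
+ // worker whose neighbour is the master only receives.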
+ void broadcast::setMaster(
+ const unsigned long rankIndex,
+ const std::vector<gaspi_rank_t>& ranks) {
+ const gaspi_rank_t partner = ranks[getPartnerIndex(rankIndex)];
+
+ receiver.push_back(&trigger);
+
+ if (partner != rank) {
+ for (long c=0; c < numRanks; c++) {
+ writer::transferParameters job(
+ true,
+ partner,
+ segment,
+ offset + chunkIndexToByte(c),
+ segment,
+ offset + chunkIndexToByte(c),
+ chunkIndexToByte(c + 1) - chunkIndexToByte(c),
+ firstNotification + c);
+ jobs.push_back(job);
+ }
+ }
+ }
+
+ inline unsigned long broadcast::getPartnerIndex(
+ const unsigned long rankIndex) const {
+ return (rankIndex + 1) % numRanks;
+ }
+
+ void broadcast::setWorker(
+ const unsigned long rankIndex,
+ const std::vector<gaspi_rank_t>& ranks) {
+ const gaspi_rank_t partner = ranks[getPartnerIndex(rankIndex)];
+
+ for (long c=0; c < numRanks; c++) {
+ receiver.push_back(
+ new mailBoxGaspi(segment, firstNotification + c));
+
+ if (partner == masterRank) {
+ jobs.push_back(writer::transferParameters());
+ } else {
+ writer::transferParameters transfer(
+ true,
+ partner,
+ segment,
+ offset + chunkIndexToByte(c),
+ segment,
+ offset + chunkIndexToByte(c),
+ chunkIndexToByte(c + 1) - chunkIndexToByte(c),
+ firstNotification + c);
+ jobs.push_back(transfer);
+ }
+ }
+ }
+
+ inline unsigned long broadcast::chunkIndexToByte(
+ const long chunkIndex) const {
+ return ((totalLength * chunkIndex + numRanks - 1) / numRanks);
+ }
+
+ broadcast::~broadcast() {
+ if (rank != masterRank) {
+ for (unsigned long i=0; i < receiver.size(); i++) {
+ delete receiver[i];
+ }
+ }
+ }
+
+ int broadcast::operator()() {
+ const unsigned long phase = status.get();
+ if (!receiver[phase]->gotNotification()) {
+ return -1;
+ }
+
+ if (rank == masterRank) {
+ for (unsigned long i=0; i < jobs.size(); i++) {
+ sender(jobs[i]);
+ }
+ } else {
+ sender(jobs[phase]);
+ }
+
+ return (status.increment() == 0) ? 0 : -1;
+ }
+
+ void broadcast::signal() {
+ trigger.notify();
+ }
+
+ long broadcast::getNumberOfNotifications(const long numRanks) {
+ return (numRanks > 1) ? numRanks : 0;
+ }
+
+ std::ostream& broadcast::report(std::ostream& s) const {
+ const unsigned long phase = status.get();
+ s << "total length: " << totalLength << std::endl
+ << "numRanks: " << numRanks << std::endl
+ << "rank: " << rank << std::endl
+ << "masterRank: " << masterRank << std::endl
+ << "segment: " << long(segment) << std::endl
+ << "offset: " << offset << std::endl
+ << "firstNotification: " << firstNotification << std::endl
+ << std::endl
+ << "phase " << phase << std::endl;
+ for (unsigned long i=0; i < jobs.size(); i++) {
+ s << ".........................." << std::endl;
+ s << "phase " << i << std::endl;
+ if ((i==0) && (rank == masterRank)) {
+ s << "Receiver: " << "user" << std::endl;
+ } else {
+ if (i < receiver.size()) {
+ mailBoxGaspi* m = (mailBoxGaspi*) receiver[i];
+ s << "Receiver: segment " << long(m->getSegmentID())
+ << " notification ID " << m->getMailID() << std::endl;
+ } else {
+ s << "Receiver: idle" << std::endl;
+ }
+ }
+
+ s << "Send : ";
+ jobs[i].report(s) << std::endl;
+ }
+ s << ".........................." << std::endl;
+
+ return s;
+ }
+ }
+}
+
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/lib/broadcast.h b/src/gpi_comm_lib/collectives/lib/broadcast.h
new file mode 100755
index 00000000..e59e0e89
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/broadcast.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include "writer.h"
+#include "mailBox.h"
+#include "mailBoxLocal.h"
+#include "counter.h"
+#include "queues.h"
+
+#include <GASPI.h>
+#include <vector>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class broadcast {
+ public:
+ broadcast(const gaspi_rank_t master_,
+ const long len,
+ const gaspi_segment_id_t segment_,
+ const gaspi_offset_t offset_,
+ const gaspi_notification_id_t firstNotification_,
+ queues& queues_);
+ ~broadcast();
+ int operator()();
+ void signal();
+ static long getNumberOfNotifications(const long numRanks);
+ std::ostream& report(std::ostream& s) const;
+
+ private:
+
+ long getNumRanks() const;
+ static long getRank();
+ static long getRankIndex(gaspi_rank_t rank,
+ const std::vector<gaspi_rank_t>& ranks);
+ void setMaster(const unsigned long rankIndex,
+ const std::vector<gaspi_rank_t>& ranks);
+ inline unsigned long getPartnerIndex(const unsigned long rankIndex) const;
+ void setWorker(const unsigned long rankIndex,
+ const std::vector<gaspi_rank_t>& ranks);
+ inline unsigned long chunkIndexToByte(const long chunkIndex) const;
+ inline static char* getSegmentPointer(const gaspi_segment_id_t segment);
+
+ const long totalLength;
+ const gaspi_group_t group;
+ const long numRanks;
+ const gaspi_rank_t rank;
+ const gaspi_rank_t masterRank;
+ const gaspi_segment_id_t segment;
+ const gaspi_offset_t offset;
+ const gaspi_notification_id_t firstNotification;
+
+ mailBoxLocal trigger;
+ std::vector<mailBox*> receiver;
+ std::vector<writer::transferParameters> jobs;
+
+ writer sender;
+ counter status;
+ };
+ }
+}
+
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/lib/counter.cpp b/src/gpi_comm_lib/collectives/lib/counter.cpp
new file mode 100755
index 00000000..f36ea582
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/counter.cpp
@@ -0,0 +1,19 @@
+#include "counter.h"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ counter::counter(const unsigned long phasePeriod_)
+ : phasePeriod(phasePeriod_),
+ value(0) {}
+
+ unsigned long counter::increment() {
+ return (++value) % phasePeriod;
+ }
+
+ unsigned long counter::get() const {
+ return value % phasePeriod;
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/counter.h b/src/gpi_comm_lib/collectives/lib/counter.h
new file mode 100755
index 00000000..5a630592
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/counter.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <atomic>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class counter {
+ public:
+ counter(const unsigned long phasePeriod_ = 1);
+ unsigned long increment();
+ unsigned long get() const;
+ private:
+
+ const unsigned long phasePeriod;
+ std::atomic<unsigned long> value;
+ };
+ }
+}
+
\ No newline at end of file
diff --git a/src/gpi_comm_lib/collectives/lib/mailBox.h b/src/gpi_comm_lib/collectives/lib/mailBox.h
new file mode 100755
index 00000000..a7d9a9d2
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/mailBox.h
@@ -0,0 +1,14 @@
+#pragma once
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class mailBox
+ {
+ public:
+ virtual bool gotNotification() = 0;
+ virtual ~mailBox() = default;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp
new file mode 100755
index 00000000..3e2b3738
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.cpp
@@ -0,0 +1,47 @@
+#include "mailBoxGaspi.h"
+#include "gpi/gaspiCheckReturn.hpp"
+
+#include <cassert>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ using tarantella::GPI::gaspiCheckReturn;
+
+ mailBoxGaspi::mailBoxGaspi(const gaspi_segment_id_t segmentID_,
+ const gaspi_notification_id_t mailID_)
+ : segmentID(segmentID_),
+ mailID(mailID_) {}
+
+ bool mailBoxGaspi::gotNotification() {
+ gaspi_notification_id_t event;
+ gaspi_return_t err = gaspi_notify_waitsome(segmentID,
+ mailID,
+ 1,
+ &event,
+ GASPI_TEST);
+ if (err == GASPI_TIMEOUT)
+ {
+ return false;
+ }
+ gaspiCheckReturn(err, "gaspi_notify_waitsome failed with ");
+
+ assert(mailID == event);
+ gaspi_notification_t value;
+ gaspiCheckReturn(gaspi_notify_reset(segmentID,
+ event,
+ &value),
+ "gaspi_notify_reset failed with ");
+ return value != 0;
+ }
+
+ gaspi_segment_id_t mailBoxGaspi::getSegmentID() const {
+ return segmentID;
+ }
+
+ gaspi_notification_id_t mailBoxGaspi::getMailID() const {
+ return mailID;
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h
new file mode 100755
index 00000000..c33b6c86
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/mailBoxGaspi.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "mailBox.h"
+
+#include <GASPI.h>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class mailBoxGaspi : public mailBox
+ {
+ public:
+ mailBoxGaspi(const gaspi_segment_id_t segmentID_,
+ const gaspi_notification_id_t mailID_);
+ bool gotNotification() override;
+ gaspi_segment_id_t getSegmentID() const;
+ gaspi_notification_id_t getMailID() const;
+
+ private:
+
+ const gaspi_segment_id_t segmentID;
+ const gaspi_notification_id_t mailID;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp
new file mode 100755
index 00000000..352f7545
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.cpp
@@ -0,0 +1,20 @@
+#include "mailBoxLocal.h"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ mailBoxLocal::mailBoxLocal()
+ : status(0),
+ target(0) {}
+
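+ // Consumes one pending notification: succeeds only while notify() has been
+ // called more often than notifications have been consumed; the CAS ensures
+ // concurrent consumers cannot take the same notification twice.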
+ bool mailBoxLocal::gotNotification() {
+ unsigned long statusOld = status;
+ return (statusOld < target) && status.compare_exchange_strong(statusOld, statusOld + 1);
+ }
+
+ void mailBoxLocal::notify() {
+ ++target;
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h
new file mode 100755
index 00000000..0d9b34fa
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/mailBoxLocal.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "mailBox.h"
+#include <atomic>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class mailBoxLocal : public mailBox
+ {
+ public:
+ mailBoxLocal();
+ bool gotNotification() override;
+ void notify();
+
+ private:
+ std::atomic<unsigned long> status;
+ std::atomic<unsigned long> target;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/queues.cpp b/src/gpi_comm_lib/collectives/lib/queues.cpp
new file mode 100755
index 00000000..d8feb3cb
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/queues.cpp
@@ -0,0 +1,59 @@
+#include "queues.h"
+#include "gpi/gaspiCheckReturn.hpp"
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ using tarantella::GPI::gaspiCheckReturn;
+
+ queues::queues(const long num,
+ const gaspi_queue_id_t first)
+ : numQueues(num)
+ , state(0) {
+ for (long i=first; i < first + num; i++) {
+ queueStock.push_back(i);
+ }
+ }
+
+ queues::queues(const std::vector<gaspi_queue_id_t>& queues_)
+ : numQueues(queues_.size()),
+ state(0),
+ queueStock(queues_) {
+ }
+
+ gaspi_queue_id_t queues::get() const {
+ return stateToQueue(state);
+ }
+
+ inline gaspi_queue_id_t queues::stateToQueue(const long state_) const {
+ return queueStock[state_ % numQueues];
+ }
+
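+ // Switches away from a full queue without locking: drains the successor
+ // queue, then advances the shared state with a CAS; if another thread wins
+ // the race, the queue that thread installed is used instead.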
+ gaspi_queue_id_t queues::swap(gaspi_queue_id_t badQueue) {
+ const long stateLocal = state;
+ const gaspi_queue_id_t queueLocal = stateToQueue(stateLocal);
+
+ if (queueLocal != badQueue) {
+ return queueLocal;
+ } else {
+ const long stateLocalNew = stateLocal + 1;
+ const gaspi_queue_id_t queueLocalNew = stateToQueue(stateLocalNew);
+
+ clearQueue(queueLocalNew);
+
+ const long stateBeforeSwap =
+ __sync_val_compare_and_swap(&state, stateLocal, stateLocalNew);
+
+ return (stateBeforeSwap == stateLocal)
+ ? queueLocalNew
+ : stateToQueue(stateBeforeSwap);
+ }
+ }
+
+ inline void queues::clearQueue(const gaspi_queue_id_t queue) {
+ gaspiCheckReturn(gaspi_wait(queue, GASPI_BLOCK),
+ "Failed to clear queue with ");
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/queues.h b/src/gpi_comm_lib/collectives/lib/queues.h
new file mode 100755
index 00000000..0679a7d0
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/queues.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <GASPI.h>
+#include <vector>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class queues {
+ public:
+ queues(const long num = 2,
+ const gaspi_queue_id_t first = 0);
+ queues(const std::vector<gaspi_queue_id_t>& queues_);
+
+ gaspi_queue_id_t get() const;
+ gaspi_queue_id_t swap(gaspi_queue_id_t badQueue);
+
+ private:
+ inline gaspi_queue_id_t stateToQueue(const long) const;
+ inline void clearQueue(const gaspi_queue_id_t queue);
+
+ static const long CACHE_LINE_SIZE = 64;
+ const long numQueues;
+
+ char pad0 [CACHE_LINE_SIZE];
+ volatile long state;
+ char pad1 [CACHE_LINE_SIZE];
+
+ std::vector<gaspi_queue_id_t> queueStock;
+ };
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/reduce.cpp b/src/gpi_comm_lib/collectives/lib/reduce.cpp
new file mode 100755
index 00000000..6e1ba9c8
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/reduce.cpp
@@ -0,0 +1,188 @@
+#include "reduce.h"
+
+#include <cstdint>
+#include <stdexcept>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ namespace
+ {
+ template <typename T>
+ inline void add(const reduce::task& t) {
+ const T* const a = (const T*) t.source;
+ T* const b = (T*) t.destination;
+ const long n = t.len;
+
+ for (long i=0; i < n; i++) {
+ b[i] += a[i];
+ }
+ }
+
+ template <typename T>
+ inline void average(const reduce::task& t) {
+ if (t.scaling > 1) {
+ const T* const a = (const T*) t.source;
+ T* const b = (T*) t.destination;
+ const long n = t.len;
+ const T s = t.scaling;
+
+ for (long i=0; i < n; i++) {
+ b[i] = (b[i] + a[i]) / s;
+ }
+ } else {
+ add<T>(t);
+ }
+ }
+
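+ // Same as average, but multiplies by the precomputed reciprocal instead of
+ // dividing per element; only used for the floating-point types below, where
+ // the rounding difference is acceptable.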
+ template <typename T>
+ inline void averageopt(const reduce::task& t) {
+ if (t.scaling > 1) {
+ const T* const a = (const T*) t.source;
+ T* const b = (T*) t.destination;
+ const long n = t.len;
+ const T s = T(1) / T(t.scaling);
+
+ for (long i=0; i < n; i++) {
+ b[i] = (b[i] + a[i]) * s;
+ }
+ } else {
+ add<T>(t);
+ }
+ }
+
+ class reduce_float_sum : public reduce {
+ public:
+ void operator()(const task& t) const {
+ add<float>(t);
+ }
+ };
+
+ class reduce_float_average : public reduce {
+ public:
+ void operator()(const task& t) const {
+ averageopt<float>(t);
+ }
+ };
+
+ class reduce_double_sum : public reduce {
+ public:
+ void operator()(const task& t) const {
+ add<double>(t);
+ }
+ };
+
+ class reduce_double_average : public reduce {
+ public:
+ void operator()(const task& t) const {
+ averageopt<double>(t);
+ }
+ };
+
+ class reduce_int16_sum : public reduce {
+ public:
+ void operator()(const task& t) const {
+ add<int16_t>(t);
+ }
+ };
+
+ class reduce_int16_average : public reduce {
+ public:
+ void operator()(const task& t) const {
+ average<int16_t>(t);
+ }
+ };
+
+ class reduce_int32_sum : public reduce {
+ public:
+ void operator()(const task& t) const {
+ add<int32_t>(t);
+ }
+ };
+
+ class reduce_int32_average : public reduce {
+ public:
+ void operator()(const task& t) const {
+ average<int32_t>(t);
+ }
+ };
+ }
+
+ reduce * getReduce(const allreduce::dataType data,
+ const allreduce::reductionType reduction) {
+ reduce* p = NULL;
+
+ switch (data) {
+ case allreduce::FLOAT:
+ switch (reduction) {
+ case allreduce::SUM:
+ p = new reduce_float_sum();
+ break;
+ case allreduce::AVERAGE:
+ p = new reduce_float_average();
+ break;
+ default:
+ break;
+ }
+ break;
+ case allreduce::DOUBLE:
+ switch (reduction) {
+ case allreduce::SUM:
+ p = new reduce_double_sum;
+ break;
+ case allreduce::AVERAGE:
+ p = new reduce_double_average;
+ break;
+ default:
+ break;
+ }
+ break;
+ case allreduce::INT16:
+ switch (reduction) {
+ case allreduce::SUM:
+ p = new reduce_int16_sum;
+ break;
+ case allreduce::AVERAGE:
+ p = new reduce_int16_average;
+ break;
+ default:
+ break;
+ }
+ break;
+ case allreduce::INT32:
+ switch (reduction) {
+ case allreduce::SUM:
+ p = new reduce_int32_sum;
+ break;
+ case allreduce::AVERAGE:
+ p = new reduce_int32_average;
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ };
+
+ if (p == NULL) {
+ throw std::runtime_error(
+ "Unsupported combination of data type and reduction type");
+ }
+
+ return p;
+ }
+
+ size_t getDataTypeSize(const allreduce::dataType d) {
+ const size_t sizes[allreduce::NUM_TYPE] = {
+ sizeof(float),
+ sizeof(double),
+ sizeof(int16_t),
+ sizeof(int32_t)
+ };
+
+ return sizes[d];
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/reduce.h b/src/gpi_comm_lib/collectives/lib/reduce.h
new file mode 100755
index 00000000..e0b7ea91
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/reduce.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "allreduce.h"
+
+#include <cstddef>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class reduce {
+ public:
+ struct task {
+ const void* source;
+ void* destination;
+ long len;
+ unsigned long scaling;
+ task(const void* s = NULL,
+ void* d = NULL,
+ long n = 0,
+ unsigned long sc = 0)
+ : source(s), destination(d), len(n), scaling(sc) {}
+ };
+
+ virtual void operator()(const task& t) const = 0;
+ virtual ~reduce() {}
+ };
+
+ reduce * getReduce(const allreduce::dataType data,
+ const allreduce::reductionType reduction);
+
+ size_t getDataTypeSize(const allreduce::dataType d);
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/writer.cpp b/src/gpi_comm_lib/collectives/lib/writer.cpp
new file mode 100755
index 00000000..ed5ad550
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/writer.cpp
@@ -0,0 +1,80 @@
+#include "writer.h"
+#include "gpi/gaspiCheckReturn.hpp"
+
+#include <stdexcept>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ const gaspi_size_t writer::MESSAGE_LENGTH_LIMIT = 0x40000000;
+
+ using tarantella::GPI::gaspiCheckReturn;
+
+ writer::transferParameters::transferParameters(
+ bool a,
+ gaspi_rank_t r,
+ gaspi_segment_id_t sl,
+ gaspi_offset_t ol,
+ gaspi_segment_id_t sr,
+ gaspi_offset_t orm,
+ gaspi_size_t sz,
+ gaspi_notification_id_t id)
+ : active(a),
+ rank(r),
+ segmentLocal(sl),
+ offsetLocal(ol),
+ segmentRemote(sr),
+ offsetRemote(orm),
+ size(sz),
+ notificationID(id)
+ {}
+
+ std::ostream& writer::transferParameters::report(std::ostream& s) const {
+ if (active) {
+ s << "rank " << rank
+ << " | sl " << long(segmentLocal)
+ << " ol " << offsetLocal
+ << " | sr " << long(segmentRemote)
+ << " or " << offsetRemote
+ << " ID " << notificationID
+ << " | sz " << size;
+ } else {
+ s << "idle";
+ }
+ return s;
+ }
+
+ writer::writer(queues& queues_)
+ : queueSource(queues_) {}
+
+ void writer::operator()(const transferParameters& p) {
+ if (!p.active) return;
+ // thread safe? watch queue management!
+
+ if (p.size > MESSAGE_LENGTH_LIMIT) {
+ throw std::runtime_error("writer: message is too long");
+ }
+
+ gaspi_return_t err;
+ gaspi_queue_id_t queueLocal = queueSource.get();
+ while ((err = gaspi_write_notify(p.segmentLocal,
+ p.offsetLocal,
+ p.rank,
+ p.segmentRemote,
+ p.offsetRemote,
+ p.size,
+ p.notificationID,
+ 1,
+ queueLocal,
+ GASPI_BLOCK))
+ != GASPI_SUCCESS) {
+ if (err == GASPI_QUEUE_FULL) {
+ queueLocal = queueSource.swap(queueLocal);
+ } else {
+ gaspiCheckReturn(err, "gaspi_write_notify failed with ");
+ }
+ }
+ }
+ }
+}
diff --git a/src/gpi_comm_lib/collectives/lib/writer.h b/src/gpi_comm_lib/collectives/lib/writer.h
new file mode 100755
index 00000000..db580448
--- /dev/null
+++ b/src/gpi_comm_lib/collectives/lib/writer.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "queues.h"
+
+#include <GASPI.h>
+#include <ostream>
+
+namespace tarantella
+{
+ namespace collectives
+ {
+ class writer {
+ public:
+ struct transferParameters {
+ bool active;
+ gaspi_rank_t rank;
+ gaspi_segment_id_t segmentLocal;
+ gaspi_offset_t offsetLocal;
+ gaspi_segment_id_t segmentRemote;
+ gaspi_offset_t offsetRemote;
+ gaspi_size_t size;
+ gaspi_notification_id_t notificationID;
+ transferParameters(
+ bool a = false,
+ gaspi_rank_t r = 0,
+ gaspi_segment_id_t sl = 0,
+ gaspi_offset_t ol = 0,
+ gaspi_segment_id_t sr = 0,
+ gaspi_offset_t orm = 0,
+ gaspi_size_t sz = 0,
+ gaspi_notification_id_t id = 0);
+ std::ostream& report(std::ostream& s) const;
+ };
+
+ writer(queues& queues_);
+ void operator()(const transferParameters& p);
+
+ private:
+
+ static const gaspi_size_t MESSAGE_LENGTH_LIMIT;
+
+ queues& queueSource;
+ };
+
+ }
+}
\ No newline at end of file
diff --git a/src/gpi_comm_lib/distribution/GroupBuilder.hpp b/src/gpi_comm_lib/distribution/GroupBuilder.hpp
new file mode 100644
index 00000000..a3b78b8f
--- /dev/null
+++ b/src/gpi_comm_lib/distribution/GroupBuilder.hpp
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "gpi/Context.hpp"
+#include "gpi/ResourceManager.hpp"
+
+#include <numeric>
+
+namespace tarantella
+{
+ namespace distribution
+ {
+ class DataParallelGroupBuilder
+ {
+ public:
+ DataParallelGroupBuilder(GPI::Context& context)
+ : context(context)
+ { }
+
+ GPI::Group const get_group()
+ {
+ auto& resource_manager = context.get_resource_manager();
+ auto const num_ranks = context.get_comm_size();
+
+ std::vector<gaspi_rank_t> all_ranks(num_ranks);
+ std::iota(all_ranks.begin(), all_ranks.end(), static_cast