
add stream callback #373

Merged · 3 commits · Aug 31, 2017
1 change: 1 addition & 0 deletions .travis.yml
@@ -711,6 +711,7 @@ script:
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/mem/buf/" ./memBuf ;fi
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/mem/view/" ./memView ;fi
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/meta/" ./meta ;fi
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/stream/" ./stream ;fi
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/time/" ./time ;fi
- if [ "${ALPAKA_CI_ANALYSIS}" == "OFF" ] ;then ./travis/compileExec.sh "test/unit/vec/" ./vec ;fi

2 changes: 1 addition & 1 deletion doc/markdown/user/implementation/mapping/CUDA.md
@@ -106,7 +106,7 @@ The following tables list the functions available in the [CUDA Runtime API](http

|CUDA|alpaka|
|---|---|
|cudaStreamAddCallback|-|
|cudaStreamAddCallback|alpaka::stream::enqueue(stream, \[\](){do_something();})|
|cudaStreamAttachMemAsync|-|
|cudaStreamCreate|<ul><li>stream = alpaka::stream::StreamCudaRtAsync(device);</li><li>stream = alpaka::stream::StreamCudaRtSync(device);</li></ul>|
|cudaStreamCreateWithFlags|see cudaStreamCreate (cudaStreamNonBlocking hard coded)|
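A minimal usage sketch of the mapping added in the table above. The device handle, the callback body, and the final alpaka::wait::wait call are assumptions for illustration (the wait behaviour corresponds to the thread wait trait documented further down in this diff); error handling is omitted.

#include <alpaka/alpaka.hpp>

#include <iostream>

void streamCallbackExample(alpaka::dev::DevCudaRt const & device)
{
    // Create an asynchronous CUDA RT stream on the given device
    // (see the cudaStreamCreate row in the table above).
    auto stream = alpaka::stream::StreamCudaRtAsync(device);

    // Counterpart of cudaStreamAddCallback: enqueue a host-side callable that
    // runs once all previously enqueued work in the stream has completed.
    alpaka::stream::enqueue(stream,](){ std::cout << "stream reached the callback" << std::endl; });

    // Block the calling thread until the stream has processed all tasks,
    // including the callback.
    alpaka::wait::wait(stream);
}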
19 changes: 0 additions & 19 deletions include/alpaka/stream/StreamCpuAsync.hpp
@@ -235,25 +235,6 @@ namespace alpaka
//
//-----------------------------------------------------------------------------
ALPAKA_FN_HOST static auto enqueue(
#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_CUDA_DEVICE)
stream::StreamCpuAsync & stream,
TTask & task)
#else
stream::StreamCpuAsync &,
TTask &)
#endif
-> void
{
// Workaround: Clang can not support this when natively compiling device code. See ConcurrentExecPool.hpp.
#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_CUDA_DEVICE)
stream.m_spAsyncStreamCpu->m_workerThread.enqueueTask(
task);
#endif
}
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
ALPAKA_FN_HOST static auto enqueue(
#if !(BOOST_COMP_CLANG_CUDA && BOOST_ARCH_CUDA_DEVICE)
stream::StreamCpuAsync & stream,
TTask const & task)
11 changes: 0 additions & 11 deletions include/alpaka/stream/StreamCpuSync.hpp
@@ -206,17 +206,6 @@ namespace alpaka
stream::StreamCpuSync,
TTask>
{
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
ALPAKA_FN_HOST static auto enqueue(
stream::StreamCpuSync & stream,
TTask & task)
-> void
{
boost::ignore_unused(stream);
task();
}
//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
86 changes: 79 additions & 7 deletions include/alpaka/stream/StreamCudaRtAsync.hpp
@@ -59,7 +59,7 @@ namespace alpaka
namespace detail
{
//#############################################################################
//! The CUDA RT stream implementation.
//! The CUDA RT async stream implementation.
//#############################################################################
class StreamCudaRtAsyncImpl final
{
@@ -132,7 +132,7 @@ namespace alpaka
}

//#############################################################################
//! The CUDA RT stream.
//! The CUDA RT async stream.
//#############################################################################
class StreamCudaRtAsync final
{
@@ -191,7 +191,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream device type trait specialization.
//! The CUDA RT async stream device type trait specialization.
//#############################################################################
template<>
struct DevType<
@@ -200,7 +200,7 @@ namespace alpaka
using type = dev::DevCudaRt;
};
//#############################################################################
//! The CUDA RT stream device get trait specialization.
//! The CUDA RT async stream device get trait specialization.
//#############################################################################
template<>
struct GetDev<
@@ -223,7 +223,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream event type trait specialization.
//! The CUDA RT async stream event type trait specialization.
//#############################################################################
template<>
struct EventType<
@@ -238,7 +238,79 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream test trait specialization.
//! The CUDA RT async stream enqueue trait specialization.
//#############################################################################
template<
typename TTask>
struct Enqueue<
stream::StreamCudaRtAsync,
TTask>
{
//#############################################################################
//!
//#############################################################################
struct CallbackSynchronizationData
{
std::mutex m_mutex;
std::condition_variable m_event;
bool notified = false;
};

//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
static void CUDART_CB cudaRtCallback(cudaStream_t /*stream*/, cudaError_t /*status*/, void *arg)
{
auto& callbackSynchronizationData = *reinterpret_cast<CallbackSynchronizationData*>(arg);

{
std::unique_lock<std::mutex> lock(callbackSynchronizationData.m_mutex);
callbackSynchronizationData.notified = true;
}

callbackSynchronizationData.m_event.notify_one();
}

//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
ALPAKA_FN_HOST static auto enqueue(
stream::StreamCudaRtAsync & stream,
TTask const & task)
-> void
{
auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();

ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
stream.m_spStreamCudaRtAsyncImpl->m_CudaStream,
cudaRtCallback,
pCallbackSynchronizationData.get(),
0u));

std::thread t(
Member commented:

Creating a thread per callback can be very expensive. For example, PIConGPU spawns over 2000 kernels/memcpys per second and can have over 100 tasks waiting in streams. That would mean spawning 2k threads per second and keeping over 100 threads active.

Is it possible to use one thread for all callbacks (waiting in the background), add callbacks to a list, and then always execute them from that callback thread?

We can also move this to a later pull request if it is not easy to do right now.

(A sketch of such a callback worker thread follows after this file's diff.)

Member Author commented:

Currently, I would like to merge this as is and create a follow-up issue for the optimization. The most flexible solution would be a thread pool with a queue of ready callbacks: with only one thread the latency would be highest, but it would equal your single-thread solution; with multiple threads, the latency/resource trade-off can be adapted per use case.

Member Author commented:

Can we merge this and create the follow-up ticket? My upcoming event unit tests are based on some of these stream test helpers.

Member commented:

Sorry, I was busy. Yes, I will merge it.

[pCallbackSynchronizationData, task](){

// If the callback has not yet been called, we wait for it.
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
if(!pCallbackSynchronizationData->notified)
{
pCallbackSynchronizationData->m_event.wait(
lock,
[pCallbackSynchronizationData](){
return pCallbackSynchronizationData->notified;
}
);
}

task();
}
);

t.detach();
}
};
//#############################################################################
//! The CUDA RT async stream test trait specialization.
//#############################################################################
template<>
struct Empty<
@@ -269,7 +341,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream thread wait trait specialization.
//! The CUDA RT async stream thread wait trait specialization.
//!
//! Blocks execution of the calling thread until the stream has finished processing all previously requested tasks (kernels, data copies, ...)
//#############################################################################
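Relating to the review discussion embedded above (thread-per-callback overhead): a minimal sketch of the suggested alternative, a single background worker thread that executes queued callbacks instead of spawning one detached thread per enqueued task. The class CallbackWorker and its interface are hypothetical and not part of this pull request.

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class CallbackWorker
{
public:
    CallbackWorker() : m_worker([this](){ run(); }) {}
    ~CallbackWorker()
    {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_stop = true;
        }
        m_cond.notify_one();
        m_worker.join();
    }
    // Called e.g. from the CUDA callback once the stream has reached the task.
    void push(std::function<void()> task)
    {
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            m_tasks.push(std::move(task));
        }
        m_cond.notify_one();
    }
private:
    void run()
    {
        for(;;)
        {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(m_mutex);
                m_cond.wait(lock, [this](){ return m_stop || !m_tasks.empty(); });
                if(m_stop && m_tasks.empty()) { return; }
                task = std::move(m_tasks.front());
                m_tasks.pop();
            }
            // Execute the callback outside the lock so new tasks can be pushed.
            task();
        }
    }
    std::mutex m_mutex;
    std::condition_variable m_cond;
    std::queue<std::function<void()>> m_tasks;
    bool m_stop = false;
    std::thread m_worker;   // started last, after all other members are initialized
};

In the Enqueue trait, the CUDA callback would then push the user task into such a worker instead of constructing and detaching a new std::thread per task, keeping the number of host threads constant.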
79 changes: 72 additions & 7 deletions include/alpaka/stream/StreamCudaRtSync.hpp
@@ -59,7 +59,7 @@ namespace alpaka
namespace detail
{
//#############################################################################
//! The CUDA RT stream implementation.
//! The CUDA RT sync stream implementation.
//#############################################################################
class StreamCudaRtSyncImpl final
{
@@ -132,7 +132,7 @@ namespace alpaka
}

//#############################################################################
//! The CUDA RT stream.
//! The CUDA RT sync stream.
//#############################################################################
class StreamCudaRtSync final
{
@@ -191,7 +191,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream device type trait specialization.
//! The CUDA RT sync stream device type trait specialization.
//#############################################################################
template<>
struct DevType<
@@ -200,7 +200,7 @@ namespace alpaka
using type = dev::DevCudaRt;
};
//#############################################################################
//! The CUDA RT stream device get trait specialization.
//! The CUDA RT sync stream device get trait specialization.
//#############################################################################
template<>
struct GetDev<
@@ -223,7 +223,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream event type trait specialization.
//! The CUDA RT sync stream event type trait specialization.
//#############################################################################
template<>
struct EventType<
@@ -238,7 +238,72 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream test trait specialization.
//! The CUDA RT sync stream enqueue trait specialization.
//#############################################################################
template<
typename TTask>
struct Enqueue<
stream::StreamCudaRtSync,
TTask>
{
//#############################################################################
//!
//#############################################################################
struct CallbackSynchronizationData
{
std::mutex m_mutex;
std::condition_variable m_event;
bool notified = false;
};

//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
static void CUDART_CB cudaRtCallback(cudaStream_t /*stream*/, cudaError_t /*status*/, void *arg)
{
auto& callbackSynchronizationData = *reinterpret_cast<CallbackSynchronizationData*>(arg);

{
std::unique_lock<std::mutex> lock(callbackSynchronizationData.m_mutex);
callbackSynchronizationData.notified = true;
}

callbackSynchronizationData.m_event.notify_one();
}

//-----------------------------------------------------------------------------
//
//-----------------------------------------------------------------------------
ALPAKA_FN_HOST static auto enqueue(
stream::StreamCudaRtSync & stream,
TTask const & task)
-> void
{
auto pCallbackSynchronizationData = std::make_shared<CallbackSynchronizationData>();

ALPAKA_CUDA_RT_CHECK(cudaStreamAddCallback(
stream.m_spStreamCudaRtSyncImpl->m_CudaStream,
cudaRtCallback,
pCallbackSynchronizationData.get(),
0u));

// If the callback has not yet been called, we wait for it.
std::unique_lock<std::mutex> lock(pCallbackSynchronizationData->m_mutex);
if(!pCallbackSynchronizationData->notified)
{
pCallbackSynchronizationData->m_event.wait(
lock,
[pCallbackSynchronizationData](){
return pCallbackSynchronizationData->notified;
}
);
}

task();
}
};
//#############################################################################
//! The CUDA RT sync stream test trait specialization.
//#############################################################################
template<>
struct Empty<
@@ -269,7 +334,7 @@ namespace alpaka
namespace traits
{
//#############################################################################
//! The CUDA RT stream thread wait trait specialization.
//! The CUDA RT sync stream thread wait trait specialization.
//!
//! Blocks execution of the calling thread until the stream has finished processing all previously requested tasks (kernels, data copies, ...)
//#############################################################################
5 changes: 3 additions & 2 deletions include/alpaka/stream/Traits.hpp
@@ -61,8 +61,9 @@ namespace alpaka
//-----------------------------------------------------------------------------
//! Queues the given task in the given stream.
//!
//! If it has previously been queued, then this call will overwrite any existing state of the event.
//! Any subsequent calls which examine the status of event will only examine the completion of this most recent call to enqueue.
//! Special Handling for events:
//! If the event has previously been queued, then this call will overwrite any existing state of the event.
//! Any subsequent calls which examine the status of event will only examine the completion of this most recent call to enqueue.
//-----------------------------------------------------------------------------
template<
typename TStream,
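A short sketch of the event semantics documented in the comment above. The generic parameters and the alpaka::wait::wait call are assumptions based on the usual alpaka wait traits and are not taken verbatim from this diff.

template<
    typename TStream,
    typename TEvent>
void recordEventTwice(
    TStream & stream,
    TEvent & event)
{
    // First enqueue: the event will complete once the stream reaches it.
    alpaka::stream::enqueue(stream, event);

    // Second enqueue: overwrites the pending state of the event, so any
    // subsequent test or wait refers only to this most recent enqueue.
    alpaka::stream::enqueue(stream, event);

    // Blocks until the most recently enqueued occurrence has completed.
    alpaka::wait::wait(event);
}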