Race condition in CUDA blocking queue #850

Closed
psychocoderHPC opened this issue Sep 26, 2019 · 15 comments
@psychocoderHPC
Member

psychocoderHPC commented Sep 26, 2019

@krzikalla provided me with a test case where we get data races between a synchronous memcopy and a kernel.

The problem is that we create the streams with cudaStreamCreateWithFlags(..., cudaStreamNonBlocking). The CUDA documentation states that cudaStreamNonBlocking disables the blocking behavior towards the default stream 0. In alpaka we use blocking memcopy operations, e.g. cudaMemcpy, if we use the QueueCudaRtBlocking. The result is that our memcopy operations run non-blocking with respect to all enqueued kernels.

This bug is also present in the last release, 0.3.5. This means we need to do a bugfix release even though we will release 0.4.0 soon.

We have several possibilities to solve it:

  • create blocking queues with cudaStreamCreate()
    • this introduces the side effect that a memcopy will block all streams
  • use asynchronous memcopy and do an explicit synchronization after each call (see the sketch after this list)
    • this increases the host-side overhead since an API call has a latency of ~10 - 16 us
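
A minimal stand-alone sketch of the two options in plain CUDA (illustrative only, hypothetical code, not the actual alpaka implementation):

#include <vector>
#include <cuda_runtime.h>

__global__ void touch(double* data) { data[blockIdx.x * blockDim.x + threadIdx.x] = 2.0; }

int main()
{
  const size_t size = 1024;
  double* devPtr = nullptr;
  cudaMalloc(&devPtr, size * sizeof(double));
  std::vector<double> host(size, 1.0);

  // Option 1: create the queue's stream WITHOUT cudaStreamNonBlocking.
  // The legacy default-stream semantics then make the blocking cudaMemcpy below
  // wait for the kernel in this stream (side effect: it blocks all such streams).
  cudaStream_t blockingStream;
  cudaStreamCreate(&blockingStream);
  touch<<<size / 64, 64, 0, blockingStream>>>(devPtr);
  cudaMemcpy(devPtr, host.data(), size * sizeof(double), cudaMemcpyHostToDevice);

  // Option 2: keep cudaStreamNonBlocking, but enqueue an asynchronous copy into the
  // queue's own stream and follow it with an explicit synchronization.
  cudaStream_t nonBlockingStream;
  cudaStreamCreateWithFlags(&nonBlockingStream, cudaStreamNonBlocking);
  touch<<<size / 64, 64, 0, nonBlockingStream>>>(devPtr);
  cudaMemcpyAsync(devPtr, host.data(), size * sizeof(double),
                  cudaMemcpyHostToDevice, nonBlockingStream);
  cudaStreamSynchronize(nonBlockingStream);  // extra API call, ~10 - 16 us host-side latency

  cudaStreamDestroy(blockingStream);
  cudaStreamDestroy(nonBlockingStream);
  cudaFree(devPtr);
  return 0;
}

The reproducer (native CUDA and alpaka version) follows.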
#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <vector>
#include "alpaka/alpaka.hpp"

//#define NATIVE_CUDA 0
#ifdef NATIVE_CUDA

#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d", __LINE__); exit(1); }


__global__ void emptyCudaKernel(int threadElementExtent)
{
  assert(threadElementExtent == 1);
}

__global__ void myCudaKernel(const double* sourceData)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  // note: here we are supposed to check that i is in the array range
  // but this is not what is causing the issue
  if (sourceData[i] != 1.0)
    printf("%u %u %u: %f\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}



int main(int argc, char* argv[])
{
  cudaStream_t cudaStream;
  void * memPtr;
  const size_t size = 900 * 5 * 5;

  CUDA_ASSERT(cudaSetDevice(0));
  CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));  
  CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));

  // note: here we assume size is not a multiple of 64, this is unrelated to the issue
  dim3 gridDim(std::size_t(((size - 1) / 64) + 1), 1u, 1u);
  dim3 blockDim(64u, 1u, 1u);
  emptyCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>(argc);
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  
  std::vector<double> sourceMemHost(size, 1.0);
  CUDA_ASSERT(cudaMemcpy(memPtr, sourceMemHost.data(), size * sizeof(double), cudaMemcpyHostToDevice));
  cudaStreamSynchronize(cudaStream);
 
  myCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>((double*)memPtr);

  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  CUDA_ASSERT(cudaStreamDestroy(cudaStream));
  return 0;
}
  

#else

struct MyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, const double* sourceData) const
   {
     int i = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
     // note (same as for CUDA): here we are supposed to check that i is in the array range
     // but this is not what is causing the issue
     if(sourceData[i] != 1.0)
       printf("%u %u %u %lu\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
   }
};

struct EmptyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, int threadElementExtent) const
   {
     assert(threadElementExtent == 1);
   }
};



int main(int argc, char* argv[])
{
   const size_t size = 900 * 5 * 5;

   using ComputeAccelerator = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1>, std::size_t>;
   using ComputeDevice = alpaka::dev::Dev<ComputeAccelerator>;
   using ComputeStream = alpaka::stream::StreamCudaRtSync;

   ComputeDevice computeDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<ComputeDevice> >(0));
   ComputeStream computeStream (computeDevice);

   using V = alpaka::vec::Vec<alpaka::dim::DimInt<1>, std::size_t>;
   using WorkDivision = alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1>, std::size_t>;
   WorkDivision wd(V(std::size_t(((size - 1) / 64) + 1)), V(std::size_t(64)), V(std::size_t(1)));

   using HostAccelerator = alpaka::acc::AccCpuOmp2Blocks<alpaka::dim::DimInt<1>, std::size_t>;
   using HostDevice = alpaka::dev::Dev<HostAccelerator>;
   alpaka::vec::Vec<alpaka::dim::DimInt<1>, size_t> bufferSize (size);
   using HostBufferType = decltype(
     alpaka::mem::buf::alloc<double, size_t>(std::declval<HostDevice>(), bufferSize));
   using HostViewType = alpaka::mem::view::ViewPlainPtr<alpaka::dev::Dev<HostBufferType>,
     alpaka::elem::Elem<HostBufferType>, alpaka::dim::Dim<HostBufferType>, alpaka::size::Size<HostBufferType> >;

   HostDevice hostDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<HostDevice> >(0u));

   auto sourceMem = alpaka::mem::buf::alloc<double, size_t>(computeDevice, size);

   alpaka::stream::enqueue(computeStream, alpaka::exec::create<ComputeAccelerator>(wd, EmptyKernel(), argc));

   std::vector<double> sourceMemHost(size, 1.0);
   HostViewType hostBufferView(sourceMemHost.data(), hostDevice, bufferSize);
   alpaka::mem::view::copy(computeStream, sourceMem, hostBufferView, bufferSize);
   alpaka::wait::wait(computeStream);

   alpaka::stream::enqueue(computeStream,
     alpaka::exec::create<ComputeAccelerator>(wd, MyKernel(), alpaka::mem::view::getPtrNative(sourceMem)));

   alpaka::wait::wait(computeStream);

   return 0;
}
#endif
@BenjaminW3
Member

BenjaminW3 commented Sep 28, 2019

This is somehow related to #768, #783 and #791 where we added support for using CPU queues from multiple host threads.
This was not supposed to work in the 3.x release line and so I would not port back this fix. It also does not work for CPU queues there. Porting back everything needed to make this feature work in the old releases would be a huge amount of work which I would not want anyone to waste his time on.
For the 4.0 release on the other hand it is an issue that should be fixed because we now also support this for the CPU queues.

@BenjaminW3
Member

What we implemented for the blocking CPU queue is a std::mutex that is locked around task execution. As a result, when multiple concurrent host threads try to execute something, only one task is ever executing at a given time while all other host threads are blocked waiting. However, the order in which the waiting tasks are executed is not specified (nobody guarantees first come, first served). If you rely on the order of tasks being executed on the blocking CPU queue, you still have to do manual synchronization between the host threads so that only one thread is ever trying to execute something in the queue at any time.
The only thing the blocking CPU queue currently achieves is that only one task is ever executed at any time, even when called from multiple threads. It does NOT preserve the order of calls. You might see this as a bug, but until now this was no requirement (the same is true for this "bug").
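
Roughly, the current behavior corresponds to the following simplified sketch (hypothetical code, not the actual alpaka class):

#include <mutex>

// Simplified model of a blocking CPU queue: mutual exclusion only, no ordering.
struct BlockingCpuQueue
{
    std::mutex m_mutex;

    template<typename Task>
    void enqueue(Task task)
    {
        // Only one host thread executes a task at a time ...
        std::lock_guard<std::mutex> lock(m_mutex);
        // ... but which of several waiting threads acquires the lock next is
        // unspecified, so the call order is NOT first come, first served.
        task();
    }
};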

@BenjaminW3
Member

BenjaminW3 commented Sep 28, 2019

The blocking GPU queue does not even achieve this "only one task in the queue is executed at any time" property when called from multiple threads, as this was no requirement up until now. If you enqueue two memcpys from two host threads, they are executed in parallel. Also, the currentThreadWaitFor and empty functions on a blocking GPU queue do not work as expected when called from a different host thread, because the blocking CUDA operations are never enqueued to a CUDA queue at all.

@BenjaminW3
Member

To fix the issue that currentThreadWaitFor and empty do not work, we have to use the asynchronous CUDA memory operations together with cudaStreamSynchronize.

There would still be the issue that multiple tasks enqueued in parallel into a blocking GPU queue may overlap their calls to cudaXxxAsync and cudaStreamSynchronize, leading to

cudaXxxAsync1
cudaXxxAsync2
cudaStreamSynchronize1
cudaStreamSynchronize2

being received by the CUDA queue. This would be a pessimization because host thread 1 would wait for the tasks enqueued by host threads 1 and 2. To prevent this we would have to guard access to the internal CUDA queue with host-side synchronization (a std::mutex lock).

This would, on the other hand, introduce the issue that a std::mutex lock is not first come, first served, which is also present for the blocking CPU queue. To fix this we would have to adapt something like this.
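
A rough sketch of that idea (hypothetical code, not the actual alpaka queue implementation):

#include <cstddef>
#include <mutex>
#include <cuda_runtime.h>

// Simplified model of a blocking GPU queue that uses the async API plus an explicit
// synchronization, guarded by a mutex so that the cudaXxxAsync/cudaStreamSynchronize
// pairs issued by different host threads cannot interleave.
struct BlockingGpuQueue
{
    cudaStream_t m_stream{};
    std::mutex m_mutex;

    BlockingGpuQueue() { cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking); }
    ~BlockingGpuQueue() { cudaStreamDestroy(m_stream); }

    void memcpyHostToDevice(void* dst, void const* src, std::size_t numBytes)
    {
        std::lock_guard<std::mutex> lock(m_mutex);
        // The copy is now really enqueued into the CUDA stream, so
        // currentThreadWaitFor and empty can observe it ...
        cudaMemcpyAsync(dst, src, numBytes, cudaMemcpyHostToDevice, m_stream);
        // ... and the explicit synchronization restores the blocking semantics
        // for the calling host thread.
        cudaStreamSynchronize(m_stream);
    }
};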

@psychocoderHPC
Member Author

This is somehow related to #768, #783 and #791 where we added support for using CPU queues from multiple host threads.
This was not supposed to work in the 3.x release line and so I would not port back this fix. It also does not work for CPU queues there. Porting back everything needed to make this feature work in the old releases would be a huge amount of work which I would not want anyone to waste his time on.
For the 4.0 release on the other hand it is an issue that should be fixed because we now also support this for the CPU queues.

Maybe I understood you wrong, but we need to fix it in 0.3.5. The example shows that we enqueue the kernel and the memcpy into a stream, but since we ignore the stream in the implementation of enqueue for the blocking queue, we get the data race.
A simple fix for 0.3.5 is to use cudaStreamCreate without any flags; then everything works.

For 0.4.0 I will provide a fix on Monday. To have the same behaviour on CPU and GPU we should always use the async API and wait after each memcopy.

@BenjaminW3
Member

A simple fix for 0.3.5 is to use cudaStreamCreate without any flags; then everything works.

You can do this, but as you already wrote above this will have an undesired side effect that could also be classified as a bug. This will fix a bug for some people but introduce a bug for others.

Where exactly is the bug that you described? In your example you only have one stream and only one host CPU thread. In an abstract way, what I see is the following order of tasks called on a StreamCudaRtSync:

alpaka::stream::enqueue()
alpaka::mem::view::copy()
alpaka::wait::wait()
alpaka::stream::enqueue()
alpaka::wait::wait()

The wait calls should not be necessary at all; the enqueue task (which has a cudaStreamSynchronize) and the copy task (which uses cudaMemcpy) should be blocking for this queue. What am I missing?

@psychocoderHPC
Member Author

You missed that cudaMemcpy does not use any stream and is therefore enqueued in the CUDA default stream instead of the user-requested stream/queue. Because of that, the memcpy is performed in parallel/asynchronously to the kernel.
Normally cudaMemcpy blocks all streams and is not started by the CUDA driver before the work in all other streams has finished. But since we create the CUDA streams we use in alpaka with the cudaStreamNonBlocking flag, we explicitly disabled this behavior. So all CUDA API calls we use without a stream argument, e.g. cudaMemcpy, are performed sequentially, but kernels run in the user-given stream/queue and will never wait.

@psychocoderHPC
Member Author

I looked again at my CUDA example above and realised that I copied the version where I used cudaMemcpyAsync instead of cudaMemcpy. Maybe that is why you cannot see the issue. Sorry for that. I will fix it in a few seconds.

@psychocoderHPC
Member Author

Mhhh, now I am also confused. Normally cudaMemcpy must block the calling thread until the memcpy has finished. It looks like that is not the case. I will have another look at this issue on Monday.

@BenjaminW3
Member

This is exactly what I am wondering as well. My understanding was that cudaMemcpy blocks the calling host CPU thread.

@krzikalla

Maybe alpaka calls cudaMemcpyPeer?
IIRC the native CUDA code in the above example works while the alpaka code doesn't. The actual design flaw is ignoring the stream in alpaka::mem::view::copy. Maybe the correct solution for 0.4.0 is to use cudaMemcpyAsync. It is a bug nevertheless.
BTW, we are not concerned about a backport to 0.3.5.

@sbastrakov
Member

@krzikalla no, in that code snippet both the CUDA and the alpaka version do not work due to the race condition (so sometimes it might run fine, but that is not guaranteed). And they should not, because cudaMemcpy in the default stream does not synchronize with other streams created with cudaStreamNonBlocking. It is supposed not to synchronize, which was not clear to us before. So, as suggested above, we could either use the default stream creation flag instead of cudaStreamNonBlocking, or use cudaMemcpyAsync with proper streams instead of cudaMemcpy.
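
Applied to the reproducer above, the second alternative would amount to replacing the cudaMemcpy line with something like this (sketch):

  CUDA_ASSERT(cudaMemcpyAsync(memPtr, sourceMemHost.data(), size * sizeof(double),
                              cudaMemcpyHostToDevice, cudaStream));
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));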

psychocoderHPC added a commit to psychocoderHPC/alpaka that referenced this issue Sep 30, 2019
fix alpaka-group#850

By using `cudaStreamCreateWithFlags(..., cudaStreamNonBlocking)` we
disable the blocking behavior of `cudaMemcpy` and other blocking CUDA
API calls.
By using `cudaStreamDefault` we can enforce the old `legacy` behavior.
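
In CUDA terms, the change roughly amounts to creating the blocking queue's stream like this (sketch, not the exact patch):

  cudaStream_t stream;
  // cudaStreamDefault restores the legacy behavior: blocking calls such as
  // cudaMemcpy on the default stream synchronize with this stream again
  cudaStreamCreateWithFlags(&stream, cudaStreamDefault);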
@BenjaminW3
Member

What is the plan for this issue for the 0.4.0 release?

@psychocoderHPC
Member Author

My plan is to switch fully to the async API to have the same behavior for all backends.
I will work on that this week.

@psychocoderHPC
Member Author

psychocoderHPC commented Nov 18, 2019

Test code for the upcoming alpaka 0.4.0 (with all renamings applied):

#include <stdio.h>
#include <cassert>
#include <cstdlib>
#include <vector>
#include "alpaka/alpaka.hpp"

//#define NATIVE_CUDA 0
#ifdef NATIVE_CUDA

#define CUDA_ASSERT(x) if ((x) != cudaSuccess) { printf("cuda fail on line %d", __LINE__); exit(1); }


__global__ void emptyCudaKernel(int threadElementExtent)
{
  assert(threadElementExtent == 1);
}

__global__ void myCudaKernel(const double* sourceData)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  constexpr size_t size = 900 * 5 * 5;
  // note: here we are supposed to check that i is in the array range
  // but this is not what is causing the issue
  if (i < size && sourceData[i] != 1.0)
    printf("%u %u %u: %f\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
}



int main(int argc, char* argv[])
{
  cudaStream_t cudaStream;
  void * memPtr;
  const size_t size = 900 * 5 * 5;

  CUDA_ASSERT(cudaSetDevice(0));
  CUDA_ASSERT(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
  CUDA_ASSERT(cudaMalloc(&memPtr, size * sizeof(double)));

  // note: here we assume size is not a multiple of 64, this is unrelated to the issue
  dim3 gridDim(std::size_t(((size - 1) / 64) + 1), 1u, 1u);
  dim3 blockDim(64u, 1u, 1u);
  emptyCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>(argc);
  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));

  std::vector<double> sourceMemHost(size, 1.0);
  CUDA_ASSERT(cudaMemcpy(memPtr, sourceMemHost.data(), size * sizeof(double), cudaMemcpyHostToDevice));
  cudaStreamSynchronize(cudaStream);

  myCudaKernel<<<gridDim, blockDim, 0, cudaStream>>>((double*)memPtr);

  CUDA_ASSERT(cudaStreamSynchronize(cudaStream));
  CUDA_ASSERT(cudaStreamDestroy(cudaStream));
  return 0;
}


#else

struct MyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, const double* sourceData) const
   {
     constexpr size_t size = 900 * 5 * 5;
     int i = alpaka::idx::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u];
     // note (same as for CUDA): here we are supposed to check that i is in the array range
     // but this is not what is causing the issue
     if(i < size && sourceData[i] != 1.0)
       printf("%u %u %u %lu\n",blockDim.x, blockIdx.x, threadIdx.x, sourceData[i]);
   }
};

struct EmptyKernel
{
   template<typename Acc>
   ALPAKA_FN_ACC void operator()(Acc const & acc, int threadElementExtent) const
   {
     assert(threadElementExtent == 1);
   }
};



int main(int argc, char* argv[])
{
   const size_t size = 900 * 5 * 5;

   using ComputeAccelerator = alpaka::acc::AccGpuCudaRt<alpaka::dim::DimInt<1>, std::size_t>;
   using ComputeDevice = alpaka::dev::Dev<ComputeAccelerator>;
   using ComputeStream = alpaka::queue::QueueCudaRtBlocking;

   ComputeDevice computeDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<ComputeDevice> >(0));
   ComputeStream computeStream (computeDevice);

   using V = alpaka::vec::Vec<alpaka::dim::DimInt<1>, std::size_t>;
   using WorkDivision = alpaka::workdiv::WorkDivMembers<alpaka::dim::DimInt<1>, std::size_t>;
   WorkDivision wd(V(std::size_t(((size - 1) / 64) + 1)), V(std::size_t(64)), V(std::size_t(1)));

   using HostAccelerator = alpaka::acc::AccCpuOmp2Blocks<alpaka::dim::DimInt<1>, std::size_t>;
   using HostDevice = alpaka::dev::Dev<HostAccelerator>;
   alpaka::vec::Vec<alpaka::dim::DimInt<1>, size_t> bufferSize (size);
   using HostBufferType = decltype(
     alpaka::mem::buf::alloc<double, size_t>(std::declval<HostDevice>(), bufferSize));
   using HostViewType = alpaka::mem::view::ViewPlainPtr<alpaka::dev::Dev<HostBufferType>,
     alpaka::elem::Elem<HostBufferType>, alpaka::dim::Dim<HostBufferType>, alpaka::idx::Idx<HostBufferType> >;

   HostDevice hostDevice(alpaka::pltf::getDevByIdx<alpaka::pltf::Pltf<HostDevice> >(0u));

   auto sourceMem = alpaka::mem::buf::alloc<double, size_t>(computeDevice, size);

   alpaka::queue::enqueue(computeStream, alpaka::kernel::createTaskKernel<ComputeAccelerator>(wd, EmptyKernel(), argc));

   std::vector<double> sourceMemHost(size, 1.0);
   HostViewType hostBufferView(sourceMemHost.data(), hostDevice, bufferSize);
   alpaka::mem::view::copy(computeStream, sourceMem, hostBufferView, bufferSize);
   alpaka::wait::wait(computeStream);

   alpaka::queue::enqueue(computeStream,
     alpaka::kernel::createTaskKernel<ComputeAccelerator>(wd, MyKernel(), alpaka::mem::view::getPtrNative(sourceMem)));

   alpaka::wait::wait(computeStream);

   return 0;
}
#endif
