In [None]:
# Uncomment to install PyCUDA into the kernel's environment (e.g. on a fresh Colab runtime):
# %pip install pycuda

In [None]:
from __future__ import division
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
from pycuda import gpuarray
import pycuda.driver as drv


In [None]:
# CUDA C source for `atomic_ker`, kept as a string so it can be compiled at runtime.
#
# Each thread computes its global index `tid`, then:
#   1. atomically stores 0 into *add_out,
#   2. waits at __syncthreads(),
#   3. atomically adds 1 to *add_out,
#   4. atomically raises *max_out to `tid` if `tid` is larger.
#
# NOTE(review): __syncthreads() is a barrier for the threads of one block only,
# so the "reset, then count" ordering in steps 1-3 holds for a single-block
# launch; with several blocks a late reset could discard earlier increments.
# NOTE(review): the kernel never resets *max_out -- it is only written through
# atomicMax -- so the result depends on the value the caller placed there
# before the launch.
AtomicCode='''
__global__ void atomic_ker(int *add_out, int *max_out) 
{
 int tid = blockIdx.x*blockDim.x + threadIdx.x;
 
 // sets *add_out to 0.  Thread-safe.
 atomicExch(add_out, 0);
 __syncthreads();
 
 // adds "1" to *add_out for each thread.
 atomicAdd(add_out, 1);
 
 // sets max_out to the maximum value submitted across all threads.
 atomicMax(max_out, tid);
}
'''


In [None]:
# Compile the CUDA source held in AtomicCode, then look up the kernel
# entry point by name so it can be called like a Python function.
atomic_mod = SourceModule(AtomicCode)
atomic_ker = atomic_mod.get_function("atomic_ker")


In [None]:
# Single-element int32 device buffers that receive the kernel's results.
#
# They are zero-filled instead of allocated with `gpuarray.empty`: `empty`
# leaves whatever bytes were already in device memory, and the kernel only
# ever raises max_out through atomicMax. A leftover value larger than the
# highest thread index would therefore be reported as the "maximum".
# Zero is a safe starting point because thread indices are never negative.
add_out = gpuarray.zeros((1,), dtype=np.int32)
max_out = gpuarray.zeros((1,), dtype=np.int32)


In [None]:
# Launch geometry: one block holding 1000 threads along x.
launch_grid = (1, 1, 1)
launch_block = (1000, 1, 1)

atomic_ker(add_out, max_out, grid=launch_grid, block=launch_block)

# Block the host until the kernel has finished before the results are read.
drv.Context.synchronize()


In [None]:
# Report the results; each `.get()` copies the device buffer back to a host array.
print('Atomic operations test:')

add_host = add_out.get()
print(f'add_out: {add_host}')

max_host = max_out.get()
print(f'max_out [max of the tid values]: {max_host}')


Atomic operations test:
add_out: [1000]
max_out [max of the tid values]: [999]
