In [1]:
# Uncomment to install PyCUDA into this kernel's environment (e.g. on Colab):
# %pip install pycuda

In [2]:
from __future__ import division
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
from pycuda import gpuarray
import pycuda.driver as drv


In [3]:
# CUDA C source for the two demo kernels; it is compiled at runtime by
# SourceModule further down in the notebook.
AtomicCode = '''
// Counts the launched threads in *add_out and records the largest global
// thread index in *max_out, using atomic read-modify-write operations so
// that concurrent updates are not lost.
//
// NOTE: the reset below is ordered before the updates only by
// __syncthreads(), which is a block-level barrier. Launch this kernel with
// a single block (grid=(1,1,1)); with several blocks one block could reset
// the outputs after another block has already updated them.
__global__ void atomic_ker(int *add_out, int *max_out)
{
 int tid = blockIdx.x*blockDim.x + threadIdx.x;

 // Reset both outputs (thread-safe) so the results do not depend on
 // whatever the device buffers contained at launch.
 atomicExch(add_out, 0);
 atomicExch(max_out, 0);
 __syncthreads();

 // adds "1" to *add_out for each thread.
 atomicAdd(add_out, 1);

 // sets *max_out to the maximum tid submitted across all threads.
 // tid is never negative, so the 0 written above is a safe floor.
 atomicMax(max_out, tid);
}

// Same increment without atomic operations.
// The read-modify-write below is a data race: many threads read the same
// old value and overwrite one another, so most increments are lost.
__global__ void simple_add_ker(int *add_out)
{
 __syncthreads();
 *add_out = *add_out + 1;
 __syncthreads();
}
'''


In [4]:
# Compile the CUDA source once, then look up a host-callable handle for
# each of the two kernels by name.
my_mod = SourceModule(AtomicCode)
atomic_ker, simple_add_ker = (
    my_mod.get_function(kernel_name)
    for kernel_name in ('atomic_ker', 'simple_add_ker')
)

In [5]:
# Zero-fill the counter: gpuarray.empty leaves the device memory
# uninitialized, and simple_add_ker reads *add_out before writing it.
add_out = gpuarray.zeros((1,), dtype=np.int32)
simple_add_ker(add_out, grid=(1,1,1), block=(1000,1,1))
drv.Context.synchronize()

print('Increment fails without Atomic operations :')
# 1000 threads each add 1, so this should be 1000 -- the race loses increments.
print(f'add_out: {add_out.get()}')

Increment fails without Atomic operations :
add_out: [1]


### Atomic Operations

In [6]:
# One-element int32 result buffers on the device, zero-filled so the
# results never depend on stale device memory: atomicMax in atomic_ker
# compares each tid against the current content of max_out.
add_out = gpuarray.zeros((1,), dtype=np.int32)
max_out = gpuarray.zeros((1,), dtype=np.int32)


In [7]:
# Launch a single block of 1000 threads, then wait for the device to
# finish before the results are read back.
atomic_ker(
    add_out,
    max_out,
    grid=(1, 1, 1),
    block=(1000, 1, 1),
)
drv.Context.synchronize()


In [8]:
# Copy both single-element results back to the host and report them.
print('Atomic operations test:')
print(f'add_out: {add_out.get()}')
print(f'max_out [max of the tid values]: {max_out.get()}')


Atomic operations test:
add_out: [1000]
max_out [max of the tid values]: [999]
