In [8]:
#!pip install pycuda

In [11]:
%%writefile atomicTest.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>


/*
 * atomic_ker: demonstrates atomicExch, atomicAdd and atomicMax on two
 * single-int outputs.
 *
 *   add_out  - every thread atomically stores 0, waits at a barrier, then
 *              atomically adds 1, so it ends up as a per-thread count.
 *   max_out  - atomically raised to the largest global thread index seen.
 *
 * Notes:
 *   - __syncthreads() is a block-level barrier, so the "reset, then count"
 *     ordering on add_out only holds among threads of the same block.
 *   - max_out is never written before atomicMax, so whatever value the
 *     caller left in it takes part in the maximum.
 */
extern "C" __global__ void atomic_ker(int *add_out, int *max_out)
{
    printf("Hello from cuda!\n");

    // Reset the counter. Every thread writes the same 0, and the barrier
    // keeps any thread of this block from counting before the reset is done.
    atomicExch(add_out, 0);
    __syncthreads();

    // Each thread contributes exactly one increment.
    atomicAdd(add_out, 1);

    // Flattened 1-D index of this thread across the launch.
    const int global_tid = threadIdx.x + blockDim.x * blockIdx.x;

    // Keep the largest index submitted by any thread.
    atomicMax(max_out, global_tid);
}

Overwriting atomicTest.cu


In [12]:
# Compile the kernel source to PTX; test.py loads ./atomicTest.ptx with module_from_file.
!nvcc -ptx -o atomicTest.ptx atomicTest.cu

In [13]:
%%writefile test.py

from __future__ import division
import numpy as np
from pycuda.compiler import SourceModule
import pycuda.autoinit
from pycuda import gpuarray
import pycuda.driver as drv

# Load the PTX produced by nvcc and look up the kernel by its extern "C" name.
atomic_mod = drv.module_from_file('./atomicTest.ptx')
atomic_ker = atomic_mod.get_function('atomic_ker')

# Launch configuration: one block of 20 threads. The kernel orders its reset
# of add_out with __syncthreads(), which only synchronises threads within a
# single block, so this demo is launched with exactly one block.
GRID_DIM = (1, 1, 1)
BLOCK_DIM = (20, 1, 1)

# Zero-initialise both outputs. The kernel only ever applies atomicMax to
# max_out, so an uninitialised buffer (gpuarray.empty) could hold a stale
# value larger than every thread index and be reported as the "maximum".
add_out = gpuarray.zeros((1,), dtype=np.int32)
max_out = gpuarray.zeros((1,), dtype=np.int32)

atomic_ker(add_out, max_out, grid=GRID_DIM, block=BLOCK_DIM)

# Wait for the kernel to finish (this also flushes device-side printf output)
# before reading the results back.
drv.Context.synchronize()

print('Atomic operations test:')
print(f'add_out: {add_out.get()}')
print(f'max_out [max of the tid values]: {max_out.get()}')

Overwriting test.py


In [14]:
!python test.py


Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Hello from cuda!
Atomic operations test:
add_out: [20]
max_out [max of the tid values]: [19]


Atomic operations test:
add_out: [1000]
max_out [max of the tid values]: [999]
