In [None]:
!pip3 install pycuda

Collecting pycuda
  Downloading https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz (1.6MB)
     |████████████████████████████████| 1.6MB 6.7MB/s 
Collecting pytools>=2011.2 (from pycuda)
  Downloading https://files.pythonhosted.org/packages/00/96/00416762a3eda8876a17d007df4a946f46b2e4ee1057e0b9714926472ef8/pytools-2019.1.1.tar.gz (58kB)
     |████████████████████████████████| 61kB 23.2MB/s 
Collecting appdirs>=1.4.0 (from pycuda)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting mako (from pycuda)
  Downloading https://files.pythonhosted.org/packages/b0/3c/8dcd6883d009f7cae0f3157fb53e9afb05a0d3d33b3db1268ec2e6f4a56b/Mako-1.1.0.tar.gz (463kB)
     |████████████████████████████████| 471kB 39.9MB/s 
Building wheels for collected packages: pycuda, pytools, mako
  B

In [None]:
import math
import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import pycuda.autoinit

In [None]:
# Compile the CUDA source below; `module` gives access to the compiled kernel.
module = SourceModule("""
// Element-wise sum of two nx * ny * nz float volumes stored with x varying
// fastest: res[i] = arr1[i] + arr2[i]. Each thread handles at most one element.
__global__ void add_two_array_3d(int nx, int ny, int nz, float *res, const float *arr1, const float *arr2){
  // Global (x, y, z) coordinates of this thread across the whole grid.
  int x = threadIdx.x + blockDim.x * blockIdx.x;
  int y = threadIdx.y + blockDim.y * blockIdx.y;
  int z = threadIdx.z + blockDim.z * blockIdx.z;
  // Threads whose coordinates fall outside the volume must not touch memory.
  if (x < nx && y < ny && z < nz){
      // Flatten the index in size_t so it cannot overflow a 32-bit int on
      // large volumes; computed only for in-range threads.
      size_t ijk = ((size_t)z * ny + y) * nx + x;
      res[ijk] = arr1[ijk] + arr2[ijk];
  }
}
""")

In [None]:
# Look up the compiled kernel by its CUDA function name; the returned handle is
# what gets called from Python to launch it.
add_two_array = module.get_function("add_two_array_3d")

In [None]:
# Extents of the volume along each axis, held as 32-bit integers because they
# are later passed to the kernel's `int` parameters.
num_x, num_y, num_z = (np.int32(extent) for extent in (3, 3, 3))
# Total number of elements in the volume.
num_components = num_z * num_y * num_x

In [None]:
# First operand: the values 0 .. num_components - 1 as float32, arranged as a
# (z, y, x) volume.
arr1 = np.reshape(
    np.arange(num_components, dtype=np.float32),
    (num_z, num_y, num_x),
)

In [None]:
# Second operand: a float32 volume of ones with the same (z, y, x) shape.
arr2 = np.ones(shape=(num_z, num_y, num_x), dtype=np.float32)

In [None]:
# Zero-initialised float32 GPU array of shape (z, y, x) that receives the result.
res_gpu = gpuarray.zeros(
    shape=(num_z, num_y, num_x),
    dtype=np.float32,
)

In [None]:
# Upload both host operands to the GPU, preserving their order.
arr1_gpu, arr2_gpu = (gpuarray.to_gpu(host_array) for host_array in (arr1, arr2))

In [None]:
# Threads per block along (x, y, z).
threads_per_block = (6, 6, 6)
# Round the block count up on every axis so the grid covers all elements even
# when an extent is not a multiple of the block size. math.ceil yields plain
# Python ints.
blocks_per_grid = tuple(
    math.ceil(extent / threads)
    for extent, threads in zip((num_x, num_y, num_z), threads_per_block)
)
block_x, block_y, block_z = blocks_per_grid

In [None]:
# Launch the kernel: the three extents first, then the output array followed by
# the two inputs; `block` and `grid` carry the launch geometry.
add_two_array(
    num_x, num_y, num_z,
    res_gpu, arr1_gpu, arr2_gpu,
    block=threads_per_block,
    grid=blocks_per_grid,
)

In [None]:
# Fetch the result from the GPU array; as the last expression of the cell, the
# returned array is displayed as the cell output.
res_gpu.get()

array([[[ 1.,  2.,  3.],
        [ 4.,  5.,  6.],
        [ 7.,  8.,  9.]],

       [[10., 11., 12.],
        [13., 14., 15.],
        [16., 17., 18.]],

       [[19., 20., 21.],
        [22., 23., 24.],
        [25., 26., 27.]]], dtype=float32)