In [None]:
!pip3 install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 4.9MB/s 
[?25hCollecting pytools>=2011.2 (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/00/96/00416762a3eda8876a17d007df4a946f46b2e4ee1057e0b9714926472ef8/pytools-2019.1.1.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 17.0MB/s 
Collecting appdirs>=1.4.0 (from pycuda)
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting mako (from pycuda)
[?25l  Downloading https://files.pythonhosted.org/packages/b0/3c/8dcd6883d009f7cae0f3157fb53e9afb05a0d3d33b3db1268ec2e6f4a56b/Mako-1.1.0.tar.gz (463kB)
[K     |████████████████████████████████| 471kB 48.4MB/s 
Building wheels for collected packages: pycuda, pytools, mako
  B

In [None]:
import math
import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda.compiler import DynamicSourceModule
import pycuda.autoinit

In [None]:
# CUDA C source for the two kernels used in this notebook.
#
#   add_two_vector          -- each thread computes its global index x and, when
#                              x < nx, writes res[x] = arr1[x] + arr2[x].
#   add_two_vector_dynamic  -- reads three ints from `grid` and three from
#                              `block`, builds dim3 launch dimensions from them
#                              and launches add_two_vector from device code.
KERNEL_SOURCE = """
__global__ void add_two_vector(int nx, float *arr1, float *arr2, float *res){
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    if (x < nx){
        res[x] = arr1[x] + arr2[x];
    }
}
__global__ void add_two_vector_dynamic(int *grid, int *block, int nx, float *arr1, float *arr2, float *res){
dim3 grid_ = dim3(grid[0], grid[1], grid[2]);
dim3 block_ = dim3(block[0], block[1], block[2]);
add_two_vector<<<grid_, block_>>>(nx, arr1, arr2, res);
}
"""

# Compile the source; DynamicSourceModule is used because one kernel launches
# another kernel from the device.
module = DynamicSourceModule(KERNEL_SOURCE)

In [None]:
# Look up the parent kernel (the one that launches add_two_vector on the
# device) in the compiled module so it can be called from Python.
add_two_vector_dynamic = module.get_function("add_two_vector_dynamic")

In [None]:
# Number of vector elements; kept as np.int32 because it is later passed
# straight to the kernel's `int nx` parameter.
num_comp = np.int32(10)

# Two independent float32 host vectors, both initialised to 0, 1, ..., num_comp - 1.
arr1 = np.arange(num_comp, dtype=np.float32)
arr2 = arr1.copy()

In [None]:
# Shuffle arr2 in place so the two operands differ. A locally seeded generator
# is used so that a fresh Restart & Run All always yields the same permutation
# (and therefore the same sums below) without touching NumPy's global RNG state.
# Note: this mutates arr2, so re-running only this cell permutes it again.
np.random.RandomState(0).shuffle(arr2)

In [None]:
# Allocate the zero-filled float32 output vector (num_comp elements) on the GPU.
res_gpu = gpuarray.zeros(num_comp, dtype=np.float32)

In [None]:
# Launch configuration for the child kernel: 256 threads along x per block,
# and enough blocks along x to cover all num_comp elements (rounded up).
threads_per_block = (256, 1, 1)
blocks_per_grid = (
    math.ceil(num_comp / threads_per_block[0]),
    1,
    1,
)

In [None]:
# Pack the launch dimensions into int32 arrays; the parent kernel receives
# them as `int *block` / `int *grid` and reads elements [0], [1], [2].
block, grid = (
    np.array(dims, dtype=np.int32)
    for dims in (threads_per_block, blocks_per_grid)
)

In [None]:
# Copy the two operand vectors and the two launch-dimension arrays to the GPU,
# in the order: arr1, arr2, block, grid.
arr1_gpu, arr2_gpu, block_gpu, grid_gpu = map(
    gpuarray.to_gpu, (arr1, arr2, block, grid)
)

In [None]:
# Run the parent kernel with a single thread (1x1x1 block, 1x1x1 grid).
# It launches add_two_vector on the device using the dimensions stored in
# grid_gpu / block_gpu, which writes arr1_gpu + arr2_gpu into res_gpu.
add_two_vector_dynamic(
    grid_gpu,
    block_gpu,
    num_comp,
    arr1_gpu,
    arr2_gpu,
    res_gpu,
    block=(1, 1, 1),
    grid=(1, 1, 1),
)

In [None]:
# Fetch the GPU result as a host array and display it.
res_gpu.get()

array([ 7.,  5.,  5.,  9.,  6., 14., 14., 12.,  9.,  9.], dtype=float32)

In [None]:
# CPU reference: the same element-wise sum computed with NumPy.
expected_sum = arr1 + arr2

# Fail loudly if the GPU result differs from the reference instead of relying
# on comparing the two printed arrays by eye.
np.testing.assert_array_equal(res_gpu.get(), expected_sum)

expected_sum

array([ 7.,  5.,  5.,  9.,  6., 14., 14., 12.,  9.,  9.], dtype=float32)