In [1]:
!pip install pycuda

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycuda
  Downloading pycuda-2022.2.2.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pytools>=2011.2
  Downloading pytools-2022.1.14.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting appdirs>=1.4.0
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Building w

**In this demo we will run kernels sequentially** <br>
Each kernel launch starts only after the previous one has finished.

In [2]:
import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from time import time


In [3]:
# Problem size for the demo.
num_arrays = 200      # how many random arrays are generated and processed
array_len = 1 << 20   # elements per array (1024**2 == 1048576)


In [4]:
# Compile the demo kernel. mult_ker(array, array_len) multiplies and then
# divides every element of `array` by 2.0 fifty times, i.e. it leaves the data
# unchanged and only burns GPU time so that the launches are measurable.
ker = SourceModule("""
__global__ void mult_ker(float * array, int array_len)
{
     // Global index of this thread within the whole grid, and the total
     // number of threads launched (the step between elements of one thread).
     int thd = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;

     // Grid-stride loop: each thread handles elements thd, thd + stride, ...
     // The bound check on array_len keeps every access inside the array for
     // any block/grid configuration and also covers a tail that is not a
     // multiple of the block size. With the launch used in this notebook
     // (one block of 64 threads) each thread visits the same elements as a
     // block-sized chunked loop would.
     for (int i = thd; i < array_len; i += stride)
     {
         // The 2.0 literals are doubles; left as-is because this inner loop
         // exists only to spend time.
         for (int k = 0; k < 50; k++) // run this 50 times just to spend some time
         {
              array[i] *= 2.0; // multiply by 2.0
              array[i] /= 2.0; // then divide by 2.0
         }
     }
}
""")

# Python-callable handle to the compiled kernel.
mult_ker = ker.get_function('mult_ker')


In [5]:
# Three independent, initially empty containers:
#   data     - host-side input arrays
#   data_gpu - their copies on the GPU
#   gpu_out  - results downloaded back from the GPU
data, data_gpu, gpu_out = [], [], []


In [6]:
# Generate num_arrays random float32 arrays of length array_len.
# The list is rebuilt (not appended to) so that re-running this cell always
# leaves exactly num_arrays arrays in `data` instead of growing it.
data = [np.random.randn(array_len).astype('float32') for _ in range(num_arrays)]


In [9]:
# Sanity check: number of host arrays, then the shape of the first one
# (printed on separate lines).
print(len(data), data[0].shape, sep='\n')

200
(1048576,)


In [10]:
# Time the fully serial pipeline: upload everything, launch one kernel per
# array, download everything.
#
# data_gpu and gpu_out are rebuilt here rather than appended to, so re-running
# this cell does not pile a second set of device arrays and results onto the
# lists (which would leave stale results in the first num_arrays slots).
t_start = time()

# copy arrays to the GPU. Serial upload
data_gpu = [gpuarray.to_gpu(data[k]) for k in range(num_arrays)]

# process arrays. Run num_arrays (= 200) kernel functions serially,
# each with a single block of 64 threads.
for device_array in data_gpu:
    mult_ker(device_array, np.int32(array_len), block=(64,1,1), grid=(1,1,1))

# copy arrays from GPU. Serial data download.
gpu_out = [device_array.get() for device_array in data_gpu]

t_end = time()

print('Total time: %f' % (t_end - t_start))


Total time: 2.756657


In [11]:
# Check the results: the kernel multiplies and divides by the same factor,
# so every downloaded array must match its host-side original.

# Exactly one result per input array is expected; a different count means the
# timing cell was run against stale lists.
assert len(gpu_out) == num_arrays, (
    'expected %d result arrays, found %d' % (num_arrays, len(gpu_out))
)

for k in range(num_arrays):
    # Same tolerances as np.allclose defaults, but a failure reports which
    # array is wrong and how far off its elements are.
    np.testing.assert_allclose(
        gpu_out[k], data[k], rtol=1e-5, atol=1e-8,
        err_msg='GPU result %d differs from its input array' % k,
    )

