## Some examples of how to use CUDA in Python with Numba
stolen from / inspired by https://github.com/ContinuumIO/gtc2017-numba

In [1]:
import numpy as np
from numba import vectorize
from numba import cuda

In [2]:
# a somewhat dumb way to call numpy to add two arrays
def Add_cpu(a, b):
    """Return ``a + b``, evaluated on the CPU.

    With NumPy array arguments the ``+`` operator adds element-wise; the
    function is just a named wrapper so that it can be timed as the CPU
    baseline.
    """
    total = a + b
    return total

In [3]:
# vectorize with numba for the GPU
@vectorize(['float32(float32, float32)'], target='cuda')
def Add(a, b):
    """Scalar float32 addition; the decorator builds a ufunc from it for the 'cuda' target."""
    total = a + b
    return total

In [5]:
# alternatively define a proper CUDA kernel
@cuda.jit
def add_kernel(x, y, out):
    """CUDA kernel that stores ``x[i] + y[i]`` into ``out[i]``.

    Every thread starts at its own position in the one-dimensional grid and
    then advances by the size of that grid, so all indices of ``x`` are
    visited whatever launch configuration is used.

    ``y`` and ``out`` are assumed to be at least as long as ``x``; only the
    length of ``x`` bounds the loop.
    """
    n_items = x.shape[0]
    first = cuda.grid(1)      # this thread's position in the 1-D grid (a single value)
    step = cuda.gridsize(1)   # size of the 1-D grid (a single value)

    for idx in range(first, n_items, step):
        out[idx] = x[idx] + y[idx]

In [6]:
# Problem size and the host-side (NumPy) operands
N = 10000
A = np.ones(N, dtype=np.float32)   # first operand: N ones
B = np.ones_like(A)                # second operand: same shape and dtype as A
C = np.empty_like(A)               # uninitialised output buffer shaped like A

In [7]:
# Add arrays on CPU
% timeit Add_cpu(A, B)

3.84 µs ± 35.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [8]:
# Add arrays on GPU
% timeit Add(A,B)

1.23 ms ± 91.1 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Or differntly
threads_per_block = 512
blocks_per_grid = int((N +threads_per_block -1)/threads_per_block)
% timeit add_kernel[blocks_per_grid, threads_per_block](A,B,C)

1.4 ms ± 444 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
# Alternatively, separate the host-to-device copies from the kernel call.
# cuda.to_device allocates global memory on the device and copies the host data into it.
a_device, b_device = (cuda.to_device(host_array) for host_array in (A, B))
# Device-side output buffer created from C as a template
out_device = cuda.device_array_like(C)

In [11]:
% timeit [cuda.synchronize(), add_kernel[blocks_per_grid, threads_per_block](a_device,b_device,out_device), cuda.synchronize()]

379 µs ± 44.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


All of the GPU variants are, not surprisingly, much slower than the plain NumPy addition. The inputs are small and the calculation is simple, so overhead (such as copying the inputs) easily dominates. Also, NumPy is not bad to start with.

Let's try something else:

In [12]:
import math  # Note that for the CUDA target, we need to use the scalar functions from the math module, not NumPy

@vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian_pdf(x, mean, sigma):
    """Gaussian probability density with the given mean and sigma, evaluated at x.

    Computes exp(-z**2 / 2) / (sigma * sqrt(2*pi)) where z = (x - mean) / sigma.
    """
    z = (x - mean) / sigma
    normalisation = sigma * np.float32((2*math.pi)**0.5)
    return math.exp(-0.5 * z**2) / normalisation

In [13]:
# Inputs for a million evaluations of the Gaussian: float32 sample points
# drawn uniformly between -3 and 3, with mean 0 and sigma 1
x = np.random.uniform(low=-3, high=3, size=10**6).astype(np.float32)
mean, sigma = np.float32(0.0), np.float32(1.0)

In [14]:
# CPU reference: evaluate the normal density over the whole array x with SciPy
import scipy.stats # provides the normal distribution as scipy.stats.norm
norm_pdf = scipy.stats.norm  # alias for the distribution object; its .pdf method is what gets timed
%timeit norm_pdf.pdf(x, loc=mean, scale=sigma)

The slowest run took 6.11 times longer than the fastest. This could mean that an intermediate result is being cached.
695 ms ± 386 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# And now the same evaluation on the GPU: the CUDA-vectorized gaussian_pdf applied to the host array x
%timeit gaussian_pdf(x, mean, sigma)

6.93 ms ± 501 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
