In [2]:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

In [3]:
# Upload a random 4x4 float32 matrix to the GPU, double it there, and copy
# the result back to the host for display.
a_gpu = gpuarray.to_gpu(
    np.random.randn(4, 4).astype(np.float32)
)
a_doubled = (a_gpu * 2).get()
a_doubled

array([[ 0.25713393, -3.8401983 , -2.9935155 , -0.17957465],
       [-0.6404688 ,  3.0378282 ,  2.0784652 , -0.22969635],
       [ 2.8222938 ,  0.54550064,  0.648804  ,  1.7144997 ],
       [-2.2465334 , -0.49560645,  1.4804364 , -1.4370807 ]],
      dtype=float32)

In [4]:
# Minimal hand-written CUDA kernel: each thread multiplies one pair of elements.
# The kernel indexes by threadIdx.x only and has no bounds check, so it must be
# launched as a single block with exactly one thread per array element.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = np.random.randn(400).astype(np.float32)
b = np.random.randn(400).astype(np.float32)

# Host buffer that drv.Out copies the kernel's result into.
dest = np.zeros_like(a)

# Derive the thread count from the data instead of repeating the literal 400, so
# the launch shape cannot drift out of sync with the array length.
# NOTE(review): a single block is capped in size (1024 threads on current CUDA
# GPUs), so this launch shape only suits small arrays.
multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(a.size, 1, 1), grid=(1, 1))

# All zeros when the GPU result matches NumPy's element-wise product.
dest - (a*b)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [None]:
# Compile a kernel that writes outvec[i] = scalar * vec[i], with one element
# handled per thread (indexed by threadIdx.x only, so a single block is assumed).
# NOTE(review): the kernel is spelled "scalr_multiply_kernel"; the spelling is
# kept as-is so any get_function() lookup elsewhere keeps working.
ker = SourceModule("""
__global__ void scalr_multiply_kernel(float *outvec, float scalar, float *vec)
{
    int i = threadIdx.x;
    outvec[i] = scalar * vec[i];
}
""")

In [5]:
from pycuda.elementwise import ElementwiseKernel

In [6]:
# ElementwiseKernel spares us the kernel boilerplate: we supply only the
# C argument list and the per-element statement (indexed by `i`), plus a name.
gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2*in[i];",
    name="gpu_2x_kernel",
)

In [7]:
# Host side: five million random float32 values and the NumPy reference (2x).
host_data = np.random.random(5_000_000).astype(np.float32)
host_data_2x = host_data * 2

# Device side: upload the input and allocate an output buffer of the same shape.
device_data = gpuarray.to_gpu(host_data)
device_data_2x = gpuarray.empty_like(device_data)

# Run the doubling kernel, then copy the result back to the host.
gpu_2x_ker(device_data, device_data_2x)
from_device = device_data_2x.get()

# True when the GPU result agrees with the NumPy reference.
np.allclose(from_device, host_data_2x)

True

In [8]:
from functools import reduce

In [9]:
# reduce folds a binary operation over a sequence; it is meant for associative
# operations. Here it sums 1..4.
reduce(
    lambda acc, item: acc + item,
    range(1, 5),
)

10

In [10]:
from pycuda.scan import InclusiveScanKernel
from pycuda.reduction import ReductionKernel

In [11]:
# An inclusive scan with a "max" operator yields the running maximum, so the
# last element of the scan is the maximum of the whole sequence.
seq = np.asarray(
    [1, 100, -3, -10000, 4, 10000, 66, 14, 21],
    dtype=np.int32,
)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu = InclusiveScanKernel(np.int32, 'a > b ? a : b')

# NOTE(review): when called with a single array the scan appears to write its
# result back into seq_gpu (in place) -- confirm against the PyCUDA docs.
# Compare the GPU result with NumPy's maximum of the host copy.
(max_gpu(seq_gpu).get()[-1], seq.max())

(10000, 10000)

In [12]:
# Show the full scan result: each entry is the maximum of the elements seen so far.
# NOTE(review): seq_gpu was presumably already overwritten by the scan in the
# previous cell; a running max is unchanged by re-scanning -- confirm.
max_gpu(seq_gpu)

array([    1,   100,   100,   100,   100, 10000, 10000, 10000, 10000])

In [17]:
# Dot product as map-reduce: map each index to the product of the two vectors'
# elements, then sum the products starting from the neutral element 0.
dot_prod = ReductionKernel(
    dtype_out=np.float32,
    arguments='float *vec1, float *vec2',
    map_expr='vec1[i] * vec2[i]',
    reduce_expr='a + b',
    neutral='0',
)

In [18]:
# Check the GPU dot product against NumPy on a tiny vector: 1*1 + 2*2 + 3*3 = 14.
array = np.asarray([1, 2, 3], dtype=np.float32)
array_gpu = gpuarray.to_gpu(array)

(dot_prod(array_gpu, array_gpu), np.dot(array, array))

(array(14., dtype=float32), 14.0)