In [6]:
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

In [7]:
# Round-trip a random 4x4 float32 matrix through the GPU: upload it, double it
# using gpuarray's overloaded arithmetic, then copy the result back to the host.
a_host = np.random.randn(4, 4).astype(np.float32)
a_gpu = gpuarray.to_gpu(a_host)
a_doubled_gpu = 2 * a_gpu
a_doubled = a_doubled_gpu.get()
a_doubled

array([[-0.6966533 , -3.7809439 ,  1.3691689 , -1.7882514 ],
       [ 1.7559218 , -1.8855509 ,  0.03559884,  0.6662696 ],
       [ 2.3568423 , -0.8794475 , -0.8797849 , -1.3574759 ],
       [ 0.26086268, -3.4398298 ,  1.7043309 ,  0.57730067]],
      dtype=float32)

In [8]:
# Element-wise product of two vectors using a hand-written CUDA kernel.
# Each thread handles the element at its own threadIdx.x, so the launch below
# must use a single block with exactly one thread per array element.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = np.random.randn(400).astype(np.float32)
b = np.random.randn(400).astype(np.float32)

dest = np.zeros_like(a)
# Size the block from the data so the thread count cannot drift out of sync
# with the array length. drv.In / drv.Out handle the host<->device copies.
multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(a.size,1,1), grid=(1,1))

# Verify the GPU result against the same product computed by NumPy.
np.allclose(dest, a*b)

In [9]:
# Compile a kernel that scales a vector by a scalar; each thread writes the
# single element selected by its threadIdx.x.
ker = SourceModule("""
__global__ void scalar_multiply_kernel(float *outvec, float scalar, float *vec)
{
    int i = threadIdx.x;
    outvec[i] = scalar * vec[i];
}
""")
scalar_multiply_gpu = ker.get_function("scalar_multiply_kernel")

# Create the input on the host, copy it to the device, and allocate a device
# output buffer modelled on the input array.
testvec = np.random.randn(512).astype(np.float32)
testvec_gpu = gpuarray.to_gpu(testvec)
outvec_gpu = gpuarray.empty_like(testvec_gpu)

# One block of 512 threads, i.e. one thread per element of testvec. The scalar
# is passed as np.float32 to match the kernel's `float scalar` parameter.
scalar_multiply_gpu(
    outvec_gpu,
    np.float32(2),
    testvec_gpu,
    block=(512, 1, 1),
    grid=(1, 1, 1),
)

# Compare the device result with the same computation done by NumPy.
gpu_result = outvec_gpu.get()
np.allclose(gpu_result, 2 * testvec)

True

In [10]:
from pycuda.elementwise import ElementwiseKernel

In [11]:
# Build an element-wise kernel that stores 2*in[i] into out[i]. Only the
# per-element operation is written here; the index `i` is supplied by PyCUDA.
gpu_2x_ker = ElementwiseKernel(
    arguments="float *in, float *out",
    operation="out[i] = 2*in[i];",
    name="gpu_2x_kernel",
)

In [12]:
# Reference input and the expected doubled result, both computed on the host.
host_data = np.random.random(5_000_000).astype(np.float32)
host_data_2x = host_data * 2

# Upload the input and allocate a device buffer for the output modelled on it.
device_data = gpuarray.to_gpu(host_data)
device_data_2x = gpuarray.empty_like(device_data)

# Run the element-wise kernel, then copy the result back to host memory.
gpu_2x_ker(device_data, device_data_2x)
from_device = device_data_2x.get()

np.allclose(from_device, host_data_2x)

True

In [13]:
from functools import reduce

In [14]:
# reduce folds a two-argument function over a sequence from left to right;
# here it sums 1..4. Parallel reductions rely on the operation being associative.
reduce(lambda acc, item: acc + item, range(1, 5))

10

In [15]:
from pycuda.scan import InclusiveScanKernel
from pycuda.reduction import ReductionKernel

In [16]:
# Running maximum via an inclusive scan: element k of the result is the max of
# seq[0..k], so the last element is the maximum of the whole sequence.
seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu = InclusiveScanKernel(np.int32, 'a > b ? a : b')

# Scan into a separate buffer: when called with only the input array the scan
# kernel stores its result back into that array, which would overwrite seq_gpu.
running_max_gpu = gpuarray.empty_like(seq_gpu)
max_gpu(seq_gpu, running_max_gpu)

running_max_gpu.get()[-1], np.max(seq)

(10000, 10000)

In [17]:
# Display the full running-maximum scan of seq_gpu.
# NOTE(review): when given only an input array, PyCUDA's scan kernel appears to
# store the result back into that array, so this likely overwrites seq_gpu — confirm.
max_gpu(seq_gpu)

array([    1,   100,   100,   100,   100, 10000, 10000, 10000, 10000])

In [18]:
# Dot product as a map-reduce: multiply the two vectors element by element
# (map_expr), then sum the products starting from the neutral element 0
# (reduce_expr). The result dtype is float32.
dot_prod = ReductionKernel(
    np.float32,
    arguments='float *vec1, float *vec2',
    map_expr='vec1[i] * vec2[i]',
    reduce_expr='a + b',
    neutral='0',
)

In [19]:
# Check the GPU dot product of [1, 2, 3] with itself against NumPy's result.
array = np.array([1, 2, 3], dtype=np.float32)
array_gpu = gpuarray.to_gpu(array)

gpu_dot = dot_prod(array_gpu, array_gpu)
cpu_dot = array @ array
gpu_dot, cpu_dot

(array(14., dtype=float32), 14.0)