In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import time as t

In [3]:
x = np.random.randn(10e7).astype(np.float32)
start = t.time()
valid = np.logical_and(-1<x,x<+1)
total_time_cpu = t.time()-start
print ('CPU: Found %d values in %f secs' % (np.sum(valid) , total_time_cpu))

CPU: Found 68265282 values in 0.142890 secs


In [4]:
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

In [5]:
kernel = SourceModule("""
__global__ void twice(float *x)
{
    const unsigned int i = threadIdx.x  + threadIdx.y*blockDim.x;
    x[i] = 2*x[i];
}
""")
twice = kernel.get_function('twice')

In [6]:
x = np.random.randn(10e7).astype(np.float32)
x_gpu = gpuarray.to_gpu(x)
twice(x_gpu, block=(4,4,1), grid=(1,1))

In [7]:
print(x, np.sum(x))
print(x_gpu.get(),np.float32(gpuarray.sum(x_gpu).get()))

[-1.27332497  0.28459895 -0.66097635 ..., -1.63916337 -0.96839261
 -2.0487504 ] 3859.75
[-2.54664993  0.56919789 -1.3219527  ..., -1.63916337 -0.96839261
 -2.0487504 ] 3862.13


In [8]:
kernel = SourceModule("""
__global__ void threshold(float *x, unsigned int len)
{
    const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int numThreads = blockDim.x * gridDim.x;
    
    for(int i = idx; i < len; i+=numThreads)
        x[i] = -1<x[i] && x[i] < +1 ? 1.0: 0.0;
}
""")
threshold = kernel.get_function('threshold')

In [9]:
start = t.time()
threshold(x_gpu, np.uint32(len(x)), block=(256,1,1), grid=(16,1))
total_time_gpu = t.time()-start

In [10]:
sumvalues = gpuarray.sum(x_gpu).get()
print('GPU:Found %d values in %f secs' % (sumvalues,total_time_gpu))

GPU:Found 68274568 values in 0.000316 secs


In [11]:
speedup = total_time_cpu/total_time_gpu
print('Speedup: %f' % speedup)

Speedup: 452.321509


In [12]:
start = t.time()
threshold(drv.InOut(x), np.uint32(len(x)), block=(256,1,1) , grid=(16,1))
total_time_gpu = t.time() - start

In [13]:
print('GPU: Found %d values in %f secs (automatic conversion)' % (np.sum(x) , total_time_gpu))

GPU: Found 68274552 values in 0.189752 secs (automatic conversion)
