In [1]:
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from time import time
from pycuda.elementwise import ElementwiseKernel

In [2]:
# multiply the *in input array by 2 and put it into the *out output array

#ElementwiseKernel(input, kernel code, kernel name)
gpu_2x_ker = ElementwiseKernel(
"float *in, float *out",
"out[i] = 2*in[i];",
"gpu_2x_ker")

In [3]:
def speedcomparison():
    
    host_data = np.float32( np.random.random(50000000) )
    t1 = time()
    host_data_2x =  host_data * np.float32(2)
    t2 = time()
    
    print('total time to compute on CPU: %f' % (t2 - t1))
    
    device_data = gpuarray.to_gpu(host_data)
    # allocate memory for output
    device_data_2x = gpuarray.empty_like(device_data)
    
    t1 = time()
    gpu_2x_ker(device_data, device_data_2x)
    t2 = time()
    from_device = device_data_2x.get()
    print('total time to compute on GPU: %f' % (t2 - t1))
    
    print('Is the host computation the same as the GPU computation? : {}'.format(np.allclose(from_device, host_data_2x) ))

In [4]:
# first time running is slow on the GPU because the kernel function needs to be compilies
speedcomparison()

total time to compute on CPU: 0.043122
total time to compute on GPU: 0.213794
Is the host computation the same as the GPU computation? : True


In [5]:
# second time is faster
speedcomparison()

total time to compute on CPU: 0.043487
total time to compute on GPU: 0.000075
Is the host computation the same as the GPU computation? : True


In [6]:
# third time is about the same as second
speedcomparison()

total time to compute on CPU: 0.043462
total time to compute on GPU: 0.000070
Is the host computation the same as the GPU computation? : True
