In [1]:
import numpy as np
import pycuda.autoinit
from pycuda import gpuarray
from pycuda.scan import InclusiveScanKernel
from time import time

In [2]:
# Draw 50 standard-normal samples and downcast them to float32, the dtype the
# scan kernel below is built for. No seed is set in the visible cells, so the
# values change from run to run. For a tiny hand-checkable input, swap in
# np.array([1, 2, 3, 4], dtype=np.float32) instead.
seq = np.random.normal(loc=0, scale=1, size=50).astype(np.float32)
# Copy the host array to the GPU.
seq_gpu = gpuarray.to_gpu(seq)


### InclusiveScanKernel

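`InclusiveScanKernel` computes an inclusive prefix scan of a binary associative operation in parallel on an array stored on the GPU: output element *i* combines input elements 1 through *i*. <br>

Examples (value of the last output element): 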

a1+a2+...+an, <br>
a1\*a2\*...\*an, <br>
max(a1,a2,...,an) <br>


In [3]:
# Build an inclusive scan kernel over float32 values whose combining step is
# addition, i.e. a running (prefix) sum.
sum_gpu = InclusiveScanKernel(
    dtype=np.float32,
    scan_expr="a+b",
)

In [4]:
# Reference prefix sum computed on the host.
cumsum_cpu = np.cumsum(seq)

# Scan into a separate output buffer. When no output array is passed the
# kernel writes the result back into its input, which would overwrite seq_gpu
# and make this cell give a different answer every time it is re-run.
cumsum_gpu = sum_gpu(seq_gpu, gpuarray.empty_like(seq_gpu)).get()

print(cumsum_cpu)
print(cumsum_gpu)

# The two results are accumulated in a different order, so allow for float32
# rounding noise instead of demanding bit-identical values.
np.testing.assert_allclose(cumsum_gpu, cumsum_cpu, rtol=1e-4, atol=1e-4)


[-0.365399   -1.1777244  -0.02585006  0.47755384 -1.2651286  -1.7261264
 -2.0969682   0.36338258  1.7729613   0.07546878  0.11856723  0.20565614
 -0.945343   -0.29189116  2.5069315   2.3546934   2.5000381   2.8058856
  1.6873157   1.7811344   1.226975    2.320427    2.0022097   2.2759705
  2.7239122   1.8514686   1.9156115   3.4370003   2.6179607   3.9064543
  4.21203     5.1885986   4.7499213   5.76593     5.392536    5.893618
  7.481668    7.1522927   7.7429366   9.500859   10.585716   11.189121
 12.154033   13.202533   12.165617   11.042746   11.929541   12.568664
 12.388678   11.892275  ]
[-0.365399   -1.1777244  -0.02585006  0.47755384 -1.2651286  -1.7261264
 -2.0969682   0.36338258  1.7729614   0.0754689   0.11856735  0.20565629
 -0.9453429  -0.29189104  2.5069315   2.3546934   2.5000384   2.8058858
  1.6873157   1.7811344   1.226975    2.320427    2.0022097   2.2759705
  2.7239125   1.8514688   1.9156117   3.4370003   2.617961    3.9064546
  4.21203     5.1885986   4.7499213   5

### Speed Test

In [5]:
# Benchmark input: 50 million standard-normal samples stored as float32.
seq_cpu = np.random.normal(loc=0, scale=1, size=50000000).astype(np.float32)
# Rebind seq_gpu to a device copy of the large array (the 50-element array
# from the earlier cell is no longer referenced by this name).
seq_gpu = gpuarray.to_gpu(seq_cpu)

In [6]:
# Time a host-side sum over every float32 sample.
# NOTE: np.sum is a plain reduction, whereas the GPU cell that follows runs a
# full inclusive scan and keeps only its last element, so the two timings do
# not measure the same amount of work.
t1 = time()
results_cpu = np.sum(seq_cpu)
t2 = time()
elapsed = t2 - t1
print('total time to compute on CPU: %f' % elapsed)

total time to compute on CPU: 0.015843


In [7]:
# Allocate the output buffer before starting the clock. Scanning into a
# separate array keeps seq_gpu intact; with no output array the kernel works
# in place, so re-running this cell would scan already-scanned data.
out_gpu = gpuarray.empty_like(seq_gpu)

t1 = time()
tmp = sum_gpu(seq_gpu, out_gpu)
# Kernel launches are asynchronous: control returns to Python before the GPU
# has finished. Block until the device is idle so the timer covers the scan
# itself rather than just the launch overhead.
pycuda.autoinit.context.synchronize()
t2 = time()

# The last element of an inclusive sum-scan is the total of all inputs.
results_gpu = tmp.get()[-1]
print('total time to compute on GPU: %f' % (t2 - t1))

total time to compute on GPU: 0.006409


In [8]:
# Display the total computed on the CPU with np.sum.
results_cpu

-1580.8334

In [9]:
# Display the total taken from the last element of the GPU scan; in the saved
# outputs it differs from the CPU value only in the last printed digit.
results_gpu

-1580.834