In [1]:
using GPUifyLoops

┌ Info: Precompiling GPUifyLoops [8c6e557c-24b2-11e9-113b-4b4a3dc760fa]
└ @ Base loading.jl:1186


In [37]:
function kernel(::Val{Dev}, A) where Dev
    @setup Dev

    @loop for i in (1:size(A,1); threadIdx().x)
        A[i] = 2*A[i]
    end
    @synchronize
end

kernel(A::Array) = kernel(Val(:CPU), A)

@static if Base.find_package("CuArrays") !== nothing
    using CuArrays
    using CUDAnative

    @eval function kernel(A::CuArray)
        @cuda threads=min(length(A), 1024) kernel(Val(:GPU), A)
    end
end

kernel (generic function with 3 methods)

In [41]:
N = 8192;

In [42]:
data_cpu = rand(N);
kernel(data_cpu)

In [43]:
@benchmark kernel(data_cpu)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     4.068 μs (0.00% GC)
  median time:      4.182 μs (0.00% GC)
  mean time:        4.194 μs (0.00% GC)
  maximum time:     151.242 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     8

In [44]:
data_gpu = cu(rand(N))
kernel(data_gpu)

In [45]:
@benchmark kernel(data_gpu)

BenchmarkTools.Trial: 
  memory estimate:  240 bytes
  allocs estimate:  7
  --------------
  minimum time:     3.885 μs (0.00% GC)
  median time:      6.373 μs (0.00% GC)
  mean time:        6.526 μs (6.55% GC)
  maximum time:     4.295 ms (99.58% GC)
  --------------
  samples:          10000
  evals/sample:     8