In [1]:
using BenchmarkTools
using CUDAnative, CuArrays
using GPUifyLoops

In [2]:
@inline incmod1(a, n) = a == n ? 1 : a+1
δx(f, Nx, i, j, k) = f[incmod1(i, Nx), j, k] - f[i, j, k]

δx (generic function with 1 method)

In [27]:
function time_stepping_kernel(::Val{Dev}, f, δxf) where Dev
    @setup Dev
    
    Nx, Ny, Nz = size(f)
    @loop for i in (1:Nx; threadIdx().x)
        for k in 1:Nz, j in 1:Ny
            δxf[i, j, k] = δx(f, Nx, i, j, k)
        end
    end
    
    @synchronize
end

time_step!(A::Array, B::Array) = time_stepping_kernel(Val(:CPU), A, B)

function time_step!(A::CuArray, B::CuArray)
    @cuda threads=512 time_stepping_kernel(Val(:GPU), A, B)
end

time_step! (generic function with 2 methods)

In [29]:
Nx, Ny, Nz = 128, 128, 64
xc, yc = rand(Nx, Ny, Nz), rand(Nx, Ny, Nz);
xg, yg = cu(rand(Nx, Ny, Nz)), cu(rand(Nx, Ny, Nz));

In [32]:
@benchmark time_step!($xc, $yc)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     5.814 ms (0.00% GC)
  median time:      5.826 ms (0.00% GC)
  mean time:        5.841 ms (0.00% GC)
  maximum time:     7.748 ms (0.00% GC)
  --------------
  samples:          856
  evals/sample:     1

In [33]:
@benchmark time_step!($xg, $yg)

BenchmarkTools.Trial: 
  memory estimate:  288 bytes
  allocs estimate:  7
  --------------
  minimum time:     3.710 μs (0.00% GC)
  median time:      4.658 ms (0.00% GC)
  mean time:        4.329 ms (0.00% GC)
  maximum time:     4.662 ms (0.00% GC)
  --------------
  samples:          145
  evals/sample:     8