In [1]:
using BenchmarkTools
using CUDAnative, CuArrays
using GPUifyLoops
using PyPlot

In [2]:
# Step an index forward by one on a periodic axis of length n:
# the last index (a == n) wraps around to 1, every other index becomes a + 1.
@inline function incmod1(a, n)
    at_end = a == n
    # ifelse evaluates both candidates, so the selection stays branch-free.
    return ifelse(at_end, 1, a + 1)
end

incmod1 (generic function with 1 method)

In [3]:
# Difference operators along x, y, and z with periodic boundary conditions.
# Nx, Ny, and Nz are the number of grid points along the respective dimension.
# Each operator returns the difference evaluated at grid point (i, j, k).

# x-difference: value at the periodic neighbour i+1 minus the value at i.
@inline function δx(f, Nx, i, j, k)
    @inbounds begin
        inext = incmod1(i, Nx)
        return f[inext, j, k] - f[i, j, k]
    end
end
# y-difference: value at the periodic neighbour j+1 minus the value at j.
@inline function δy(f, Ny, i, j, k)
    @inbounds begin
        jnext = incmod1(j, Ny)
        return f[i, jnext, k] - f[i, j, k]
    end
end
# z-difference: value at the periodic neighbour k+1 minus the value at k.
@inline function δz(f, Nz, i, j, k)
    @inbounds begin
        knext = incmod1(k, Nz)
        return f[i, j, knext] - f[i, j, k]
    end
end

# 3D divergence operator at grid point (i, j, k): the sum of the periodic
# forward differences of f along x, y and z, each divided by the matching
# grid spacing Δx, Δy, Δz.
@inline function div(f, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
    ddx = δx(f, Nx, i, j, k) / Δx
    ddy = δy(f, Ny, i, j, k) / Δy
    ddz = δz(f, Nz, i, j, k) / Δz
    return ddx + ddy + ddz
end

div (generic function with 1 method)

In [4]:
# This is the actual kernel, shared by the CPU and GPU code paths.
#
# Arguments:
#   ::Val{Dev} - device tag handed to GPUifyLoops' @setup; the wrappers below
#                pass Val(:CPU) or Val(:GPU).
#   f          - 3D input field; its size defines the grid (Nx, Ny, Nz).
#   div_f      - output array, written in place at every index visited.
#                Writes are @inbounds, so it must be at least as large as f.
# Returns nothing. The grid spacing is fixed at 1 in every direction.
function div_kernel(::Val{Dev}, f, div_f) where Dev
    @setup Dev

    Nx, Ny, Nz = size(f)
    Δx, Δy, Δz = 1, 1, 1

    # CPU: a single call walks over every grid point.
    cpuIndex3D() = CartesianIndices((Nx, Ny, Nz))

    # GPU: each thread handles exactly one grid point. The launch in calc_div
    # uses x/y threads and blocks for the first two array dimensions and one
    # block per z level, so the index is assembled in (x, y, z) order to match
    # the array layout. Single-element ranges are required here:
    # CartesianIndices((a, b, c)) with plain integers would span the whole box
    # 1:a × 1:b × 1:c instead of one point.
    function gpuIndex3D()
        i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
        j = (blockIdx().y - 1) * blockDim().y + threadIdx().y
        k = blockIdx().z
        return CartesianIndices((i:i, j:j, k:k))
    end

    # Calculate the divergence of f at every point and store it in div_f.
    # @loop takes the first index set on the CPU and the second on the GPU.
    @loop for I in (cpuIndex3D(); gpuIndex3D())
        @inbounds div_f[I] = div(f, Nx, Ny, Nz, Δx, Δy, Δz, I.I...)
    end

    @synchronize
    nothing
end

# CPU wrapper: plain `Array` arguments run the kernel with the Val(:CPU) tag.
function calc_div(f::Array, div_f::Array)
    return div_kernel(Val(:CPU), f, div_f)
end

# GPU wrapper: launches div_kernel with the Val(:GPU) tag on `CuArray`s.
#
# The launch uses 16×16 threads per block, Nx/16 × Ny/16 blocks in x and y, and
# one block per z level, i.e. one thread per grid point. The first two
# dimensions of f must therefore be multiples of 16; anything else raises an
# ArgumentError instead of launching a grid that does not cover the array.
function calc_div(f::CuArray, div_f::CuArray)
    Nx, Ny, Nz = size(f)

    Tx, Ty = 16, 16  # Threads per block
    if Nx % Tx != 0 || Ny % Ty != 0
        throw(ArgumentError(
            "size(f)[1:2] must be multiples of ($Tx, $Ty), got ($Nx, $Ny)"))
    end

    # Integer division is exact here thanks to the check above.
    Bx, By, Bz = Nx ÷ Tx, Ny ÷ Ty, Nz  # Blocks in grid.

    @cuda threads=(Tx, Ty) blocks=(Bx, By, Bz) div_kernel(Val(:GPU), f, div_f)
end

calc_div (generic function with 2 methods)

In [5]:
# Grid dimensions for the CPU run (2^29 points per array).
Nx, Ny, Nz = 1024, 1024, 512
# xc: input field of all ones, so every periodic difference (and hence the
# divergence) is zero. yc: output buffer, pre-filled with random values.
xc, yc = ones(Nx, Ny, Nz), rand(Nx, Ny, Nz);

In [6]:
# Both arguments are plain Arrays, so this dispatches to the CPU wrapper and
# overwrites yc with the divergence of xc.
calc_div(xc, yc)

In [None]:
# Plot the last z-level (k = end) of the CPU result as an image.
PyPlot.imshow(Array(yc[:, :, end]))

In [None]:
# Plot the y-z slice at i = 512 of the CPU result, rotated left by 90 degrees.
PyPlot.imshow(rotl90(Array(yc[512, :, :])))

In [7]:
# Same grid dimensions as the CPU run.
Nx, Ny, Nz = 1024, 1024, 512
# The arrays are first built on the host and then converted with cu(...) for
# the GPU: xg is all ones (input), yg holds random values (output buffer).
xg, yg = cu(ones(Nx, Ny, Nz)), cu(rand(Nx, Ny, Nz));

In [8]:
# Run the GPU wrapper (the CuArray method of calc_div) on the arrays built with cu(...).
calc_div(xg, yg)

In [None]:
# Plot the last z-level (k = end) of the GPU result; Array(...) converts the
# slice to a host Array before it is handed to PyPlot.
PyPlot.imshow(Array(yg[:, :, end]))

In [None]:
# Plot the y-z slice at i = 32 of the GPU result, rotated left by 90 degrees.
# NOTE(review): the matching CPU plot slices at i = 512, not 32 — confirm the
# difference is intentional before comparing the two images.
PyPlot.imshow(rotl90(Array(yg[32, :, :])))

In [None]:
# Fresh inputs for the benchmarks: random input and output arrays of the same
# size on the host (xc, yc) and converted for the GPU (xg, yg).
Nx, Ny, Nz = 1024, 1024, 512
xc, yc = rand(Nx, Ny, Nz), rand(Nx, Ny, Nz);
xg, yg = cu(rand(Nx, Ny, Nz)), cu(rand(Nx, Ny, Nz));

In [None]:
# Benchmark the CPU path. The arrays are interpolated with $ so the benchmark
# does not measure access to non-constant globals.
@benchmark calc_div($xc, $yc)

In [None]:
# Benchmark the GPU path. The call is wrapped in CuArrays.@sync — presumably so
# each sample waits for the kernel to finish rather than timing only the launch.
@benchmark CuArrays.@sync calc_div($xg, $yg)