In [1]:
using CUDAnative, CuArrays
using GPUifyLoops
using OffsetArrays
using BenchmarkTools
using Test

In [2]:
import Adapt
# Adapt rule for OffsetArrays: adapt the wrapped parent array to `to` and re-wrap the
# result with the same offsets as `x`.
function Adapt.adapt_structure(to, x::OffsetArray)
    adapted_parent = Adapt.adapt(to, parent(x))
    return OffsetArray(adapted_parent, x.offsets)
end

In [None]:
# Successor of `a` with periodic wrapping: returns 1 when `a == n`, otherwise `a + 1`.
# `ifelse` evaluates both candidates and selects one, so no branch is taken.
@inline function incmod1(a, n)
    at_end = a == n
    return ifelse(at_end, 1, a + 1)
end

# Forward-difference operators in x, y, and z with periodic boundary conditions.
# Each takes the field `f`, the number of grid points along its own dimension
# (Nx, Ny, or Nz), and the grid point (i, j, k). The result is the value of `f` at the
# next index along that dimension (wrapped back to 1 by `incmod1` at the last point)
# minus the value at (i, j, k). Indexing is done under `@inbounds`.
@inline function δx(f, Nx, i, j, k)
    i_next = incmod1(i, Nx)
    return @inbounds f[i_next, j, k] - f[i, j, k]
end

@inline function δy(f, Ny, i, j, k)
    j_next = incmod1(j, Ny)
    return @inbounds f[i, j_next, k] - f[i, j, k]
end

@inline function δz(f, Nz, i, j, k)
    k_next = incmod1(k, Nz)
    return @inbounds f[i, j, k_next] - f[i, j, k]
end

# 3D divergence operator at grid point (i, j, k): the sum of the x, y, and z
# differences of `f`, each divided by its grid spacing (Δx, Δy, Δz).
@inline function div(f, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
    x_term = δx(f, Nx, i, j, k) / Δx
    y_term = δy(f, Ny, i, j, k) / Δy
    z_term = δz(f, Nz, i, j, k) / Δz
    return x_term + y_term + z_term
end

In [None]:
# Divergence kernel: stores the divergence of `f` at every grid point in `div_f`.
#
# Arguments:
#   f     - input 3D field; its `size` defines the grid (Nx, Ny, Nz).
#   div_f - output array, written at every (i, j, k) in 1:Nx × 1:Ny × 1:Nz.
#
# The grid spacing is fixed at 1 in every direction. Always returns `nothing`.
function div_kernel!(f, div_f)
    Nx, Ny, Nz = size(f)
    Δx, Δy, Δz = 1, 1, 1

    # Each `@loop` lists the full index range followed by a CUDA thread/block index
    # expression. NOTE(review): presumably GPUifyLoops iterates the range on the CPU and
    # uses the second expression as the per-thread index on the GPU — confirm.
    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                @inbounds div_f[i, j, k] = div(f, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
            end
        end
    end

    @synchronize

    # Return `nothing` explicitly so the kernel's result does not depend on what
    # `@synchronize` expands to.
    return nothing
end

In [None]:
# CPU test: the divergence of a constant field must be zero at every grid point.
Nx, Ny, Nz = 32, 32, 16
xc = ones(Nx, Ny, Nz)  # constant input field
yc = rand(Nx, Ny, Nz)  # output buffer, pre-filled with random values

@launch CPU() div_kernel!(xc, yc)

@test all(iszero, yc)

In [None]:
# Benchmark the CPU launch of the divergence kernel; the global arrays are
# `$`-interpolated into the benchmarked expression.
@benchmark @launch CPU() div_kernel!($xc, $yc)

In [None]:
# GPU test: the divergence of a constant field must be zero at every grid point.
Nx, Ny, Nz = 32, 32, 16
xg, yg = CuArray(ones(Nx, Ny, Nz)), CuArray(rand(Nx, Ny, Nz));

Tx, Ty = 16, 16  # Threads per block
# Ceiling division stays in integer arithmetic and still covers the whole grid when Nx
# or Ny is not a multiple of the block size (one block per z level).
# NOTE(review): surplus threads in a partial block are presumably skipped by `@loop`'s
# range check — confirm against GPUifyLoops before using non-multiple sizes.
Bx, By, Bz = cld(Nx, Tx), cld(Ny, Ty), Nz  # Blocks in grid.

@launch CUDA() div_kernel!(xg, yg, threads=(Tx, Ty), blocks=(Bx, By, Bz))

# Copy the result back to the host before checking it.
@test all(Array(yg) .== 0)

In [3]:
# Forward differences in x and y without periodic wrapping: the neighbour at i+1
# (resp. j+1) is read directly, so `f` must hold a valid entry there. The size
# arguments Nx / Ny are accepted but not used. Indexing is done under `@inbounds`.
@inline function δx(f, Nx, i, j, k)
    @inbounds begin
        ahead = f[i+1, j, k]
        here = f[i, j, k]
    end
    return ahead - here
end

@inline function δy(f, Ny, i, j, k)
    @inbounds begin
        ahead = f[i, j+1, k]
        here = f[i, j, k]
    end
    return ahead - here
end

# Forward difference in z at grid point (i, j, k). There is no neighbour above the top
# level, so the difference at k == Nz is defined as zero; elsewhere it is
# f[i, j, k+1] - f[i, j, k], read under `@inbounds`.
@inline function δz(f, Nz, i, j, k)
    # Use the zero of the element type (not the Int literal 0) so both branches return
    # the same type and the function stays type-stable.
    k == Nz && return zero(eltype(f))
    return @inbounds f[i, j, k+1] - f[i, j, k]
end

# 3D divergence operator at grid point (i, j, k): adds up the x, y, and z differences
# of `f`, each scaled by the inverse of its grid spacing (Δx, Δy, Δz).
@inline function div(f, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
    x_term = δx(f, Nx, i, j, k) / Δx
    y_term = δy(f, Ny, i, j, k) / Δy
    z_term = δz(f, Nz, i, j, k) / Δz
    return x_term + y_term + z_term
end

div (generic function with 1 method)

In [4]:
# Divergence kernel taking the grid size explicitly: writes the divergence of `f` at
# every (i, j, k) in 1:Nx × 1:Ny × 1:Nz into `div_f`. The grid spacing is fixed at 1 in
# every direction. Returns `nothing`.
function div_kernel_halo!(Nx, Ny, Nz, f, div_f)
    Δx = Δy = Δz = 1

    # Each `@loop` lists the full index range followed by a CUDA thread/block index
    # expression for the same dimension.
    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                divergence = div(f, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                @inbounds div_f[i, j, k] = divergence
            end
        end
    end

    @synchronize
    return nothing
end

div_kernel_halo! (generic function with 1 method)

In [5]:
# CPU test with halos: a constant field (halo cells included) must have zero
# divergence at every interior grid point.
Nx, Ny, Nz = 32, 32, 16
hs = 1  # halo size

# Parent arrays carry `hs` extra cells on each side in x and y; the OffsetArray wrapper
# shifts the indices so that the interior is 1:Nx × 1:Ny × 1:Nz.
xco = OffsetArray(ones(Nx + 2hs, Ny + 2hs, Nz), 1-hs:Nx+hs, 1-hs:Ny+hs, 1:Nz);
yco = OffsetArray(rand(Nx + 2hs, Ny + 2hs, Nz), 1-hs:Nx+hs, 1-hs:Ny+hs, 1:Nz);

@launch CPU() div_kernel_halo!(Nx, Ny, Nz, xco, yco)

# Only the interior is checked; the kernel does not write the halo cells of `yco`.
@test all(iszero, yco[1:Nx, 1:Ny, 1:Nz])

[32m[1mTest Passed[22m[39m

In [None]:
# Benchmark the CPU launch of the halo kernel. Every global (grid sizes as well as the
# arrays) is `$`-interpolated so the measurement does not include untyped global access.
@benchmark @launch CPU() div_kernel_halo!($Nx, $Ny, $Nz, $xco, $yco)

In [6]:
# GPU launch of the halo kernel on OffsetArrays that wrap CuArrays.
Nx, Ny, Nz = 32, 32, 16

hs = 1  # halo size

# Parent arrays carry `hs` extra cells on each side in x and y; the OffsetArray wrapper
# shifts the indices so that the interior is 1:Nx × 1:Ny × 1:Nz.
x_underlying = CuArray{Float32}(ones(Nx+2*hs, Ny+2*hs, Nz))
xgo = OffsetArray(x_underlying, 1-hs:Nx+hs, 1-hs:Ny+hs, 1:Nz);

y_underlying = CuArray{Float32}(rand(Nx+2*hs, Ny+2*hs, Nz))
ygo = OffsetArray(y_underlying, 1-hs:Nx+hs, 1-hs:Ny+hs, 1:Nz);

Tx, Ty = 16, 16  # Threads per block
# Ceiling division stays in integer arithmetic and still covers the whole grid when Nx
# or Ny is not a multiple of the block size (one block per z level).
# NOTE(review): surplus threads in a partial block are presumably skipped by `@loop`'s
# range check — confirm against GPUifyLoops before using non-multiple sizes.
Bx, By, Bz = cld(Nx, Tx), cld(Ny, Ty), Nz  # Blocks in grid.

# NOTE(review): the recorded run of this launch failed with a KernelError because the
# OffsetArray{Float32,3,CuArray{Float32,3}} arguments are not isbits; the arguments
# apparently reach the kernel without being converted to a device-side array — confirm
# how `@launch` adapts its arguments.
@launch CUDA() div_kernel_halo!(Nx, Ny, Nz, xgo, ygo, threads=(Tx, Ty), blocks=(Bx, By, Bz))

CUDAnative.KernelError: GPU compilation of #16(Int64, Int64, Int64, OffsetArray{Float32,3,CuArray{Float32,3}}, OffsetArray{Float32,3,CuArray{Float32,3}}) failed
KernelError: passing and using non-bitstype argument

Argument 5 to your kernel function is of type OffsetArray{Float32,3,CuArray{Float32,3}}.
That type is not isbits, and such arguments are only allowed when they are unused by the kernel.


In [7]:
# NOTE(review): the recorded run of this call raised `UndefVarError: timings not
# defined`, so the CUDAnative version used here does not provide `timings` — confirm
# which version (if any) is required.
CUDAnative.timings()

UndefVarError: UndefVarError: timings not defined