In [None]:
# Switch into the Oceananigans.jl directory and activate the project found there.
using Pkg
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
cd("/home/gridsan/aramadhan/Oceananigans.jl/")
# "." is the directory just entered by `cd`; the trailing `;` suppresses cell output.
Pkg.activate(".");

In [None]:
import GPUifyLoops
using GPUifyLoops: @launch, @loop, @unroll, @synchronize
using Oceananigans
using CUDAnative

In [None]:
"""
    fill_halo_regions!(grid::Grid, fields...)

Fill the x and y halo points of every array in `fields` with wrap-around (periodic)
copies of interior values, for every level `k in 1:grid.Nz`.

For `h in 1:grid.Hx` the x halos receive `f[1-h, j, k] = f[Nx-h+1, j, k]` and
`f[Nx+h, j, k] = f[h, j, k]` (for `j in 1:Ny`); the y halos are filled the same way
with `grid.Hy` (for `i in 1:Nx`). Corner halo points, where both `i` and `j` lie
outside the interior, are not written, and no halo is filled in z.

Each array must accept indices `1-Hx:Nx+Hx` and `1-Hy:Ny+Hy` in its first two
dimensions. Returns `nothing`.
"""
function fill_halo_regions!(grid::Grid, fields...)
    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz  # Interior extents.
    Hx, Hy = grid.Hx, grid.Hy               # Halo widths in x and y.

    for field in fields, k in 1:Nz
        # x direction: west halo takes the easternmost interior values and vice versa.
        for j in 1:Ny, h in 1:Hx
            field[1 - h, j, k] = field[Nx - h + 1, j, k]
            field[Nx + h, j, k] = field[h, j, k]
        end

        # y direction: south halo takes the northernmost interior values and vice versa.
        for i in 1:Nx, h in 1:Hy
            field[i, 1 - h, k] = field[i, Ny - h + 1, k]
            field[i, Ny + h, k] = field[i, h, k]
        end
    end
end

In [None]:
"""
    fill_halo_regions_x!(grid::RegularCartesianGrid, fields...)

Fill the x-direction halo points of every array in `fields` with wrap-around copies
of interior values: `f[1-h, j, k] = f[Nx-h+1, j, k]` and `f[Nx+h, j, k] = f[h, j, k]`
for `h in 1:grid.Hx`.

The `j` and `k` loops use GPUifyLoops' `@loop for idx in (range; gpu_index)` form:
`k` is paired with `blockIdx().z` and `j` with the global y thread index
`(blockIdx().y - 1) * blockDim().y + threadIdx().y`.
NOTE(review): presumably a GPU launch therefore needs one z-block per vertical
level - confirm against the GPUifyLoops documentation.
"""
function fill_halo_regions_x!(grid::RegularCartesianGrid, fields...)
    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz  # Interior extents.
    Hx = grid.Hx                            # Halo width in x.

    @loop for k in (1:Nz;
                    blockIdx().z)
        @loop for j in (1:Ny;
                        (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @unroll for field in fields
                @unroll for h in 1:Hx
                    west_halo = 1 - h
                    east_halo = Nx + h
                    field[west_halo, j, k] = field[Nx - h + 1, j, k]
                    field[east_halo, j, k] = field[h, j, k]
                end
            end
        end
    end

    # GPUifyLoops synchronization macro; kept as the final statement of the kernel.
    @synchronize
end

In [None]:
"""
    fill_halo_regions_y!(grid::Grid, fields...)

Fill the y-direction halo points of every array in `fields` with wrap-around copies
of interior values: `f[i, 1-h, k] = f[i, Ny-h+1, k]` and `f[i, Ny+h, k] = f[i, h, k]`
for `h in 1:grid.Hy`, `i in 1:grid.Nx` and `k in 1:grid.Nz`.

The `i` and `k` loops use GPUifyLoops' `@loop for idx in (range; gpu_index)` form:
`k` is paired with `blockIdx().z` and `i` with the global x thread index
`(blockIdx().x - 1) * blockDim().x + threadIdx().x`.
NOTE(review): presumably a GPU launch therefore needs one z-block per vertical
level - confirm against the GPUifyLoops documentation.
"""
function fill_halo_regions_y!(grid::Grid, fields...)
    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz  # Number of grid points.
    Hy = grid.Hy                            # Size of the halo region in y.

    @loop for k in (1:Nz; blockIdx().z)
        @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
            @unroll for f in fields
                # The halo depth in y is Hy; iterating over Hx here would fill the
                # wrong number of halo points whenever Hx != Hy.
                @unroll for h in 1:Hy
                    f[i,  1-h, k] = f[i, Ny-h+1, k]
                    f[i, Ny+h, k] = f[i,      h, k]
                end
            end
        end
    end

    # GPUifyLoops synchronization macro; kept as the final statement of the kernel.
    @synchronize
end

In [None]:
# Small test problem: 5 grid points per direction. No `arch` keyword is passed, so the
# model uses whatever architecture `Model` defaults to.
Nx = Ny = Nz = 5
model = Model(N=(Nx, Ny, Nz), L=(100, 100, 100));

In [None]:
# Take the raw data arrays of the T and S tracers, zero every entry, then overwrite
# only the interior points 1:Nx x 1:Ny x 1:Nz with random values. Entries outside
# that index range stay zero, which makes a later halo fill easy to spot.
T = model.tracers.T.data
S = model.tracers.S.data

for tracer in (T, S)
    tracer .= 0
    @views tracer[1:Nx, 1:Ny, 1:Nz] .= rand.()
end

In [None]:
# Evaluate T; the trailing semicolon suppresses the displayed output.
T;

In [None]:
# Call the x-halo function directly (no `@launch`) on the model grid and both tracer
# arrays.
fill_halo_regions_x!(model.grid, T, S)

In [None]:
# Display S (no trailing semicolon) to inspect the x halos after the fill.
S

In [None]:
# Same 5-point-per-direction problem as before, but built with `arch=GPU()`.
Nx = Ny = Nz = 5
model = Model(N=(Nx, Ny, Nz), L=(100, 100, 100), arch=GPU());

In [None]:
# Thread/block layout used by the `@launch` calls in this notebook.
#
# The halo kernels defined in this notebook take the vertical index directly from
# `blockIdx().z` and never read `threadIdx().z`. The grid therefore needs exactly one
# z-block per vertical level: Tz = 1, so that Bz = cld(Nz, Tz) = Nz. Putting Nz threads
# in z instead collapses Bz to 1, and a GPU launch would only process level k = 1.
max_threads = 1024  # presumably the per-block thread limit of the target GPU

Tx  = min(max_threads, Nx)
Ty  = min(fld(max_threads, Tx), Ny)
Tz  = 1

Bx, By, Bz = cld(Nx, Tx), cld(Ny, Ty), cld(Nz, Tz)

@show Tx, Ty, Tz
@show Bx, By, Bz;

In [None]:
# Rebind T to the temperature data of the model built with `arch=GPU()`. There is no
# trailing semicolon, so the cell displays the array.
T = model.tracers.T.data

In [None]:
# Launch the grid-based x-halo kernel for the temperature data on the device returned
# by `device(GPU())`, using the Tx..Bz layout computed in the configuration cell.
# NOTE(review): the kernel takes k from `blockIdx().z`, so the z block count must
# cover all Nz levels - confirm the value of Bz before relying on the result.
@launch device(GPU()) fill_halo_regions_x!(model.grid, model.tracers.T.data, threads=(Tx, Ty, Tz), blocks=(Bx, By, Bz))

In [None]:
"""
    fill_halo_regions_x!(Nx, Ny, Nz, f)

Grid-free variant for a halo of width one in x: for every `j in 1:Ny` and
`k in 1:Nz`, set `f[0, j, k] = f[Nx, j, k]` and `f[Nx+1, j, k] = f[1, j, k]`.

`f` must accept first-dimension indices `0:Nx+1`. The loops use GPUifyLoops'
`@loop for idx in (range; gpu_index)` form: `k` is paired with `blockIdx().z` and
`j` with the global y thread index.
"""
function fill_halo_regions_x!(Nx, Ny, Nz, f)
    west_halo = 0       # Halo index just below the first interior point.
    east_halo = Nx + 1  # Halo index just above the last interior point.

    @loop for k in (1:Nz;
                    blockIdx().z)
        @loop for j in (1:Ny;
                        (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            f[west_halo, j, k] = f[Nx, j, k]
            f[east_halo, j, k] = f[1, j, k]
        end
    end

    # GPUifyLoops synchronization macro; kept as the final statement of the kernel.
    @synchronize
end

In [None]:
using OffsetArrays
using CuArrays

In [None]:
# Build a zero-filled array with one extra point on each side in x and y, wrap it so
# those dimensions are indexed 0:Nx+1 and 0:Ny+1, then randomize only the interior
# 1:Nx x 1:Ny x 1:Nz.
Nx = Ny = Nz = 5

underlying_data = zeros(Nx + 2, Ny + 2, Nz)
data = OffsetArray(underlying_data, 0:(Nx + 1), 0:(Ny + 1), 1:Nz)

@views data[1:Nx, 1:Ny, 1:Nz] .= rand.();

In [None]:
# Run the positional-argument x-halo kernel through `@launch` on GPUifyLoops' CPU
# device, passing the OffsetArray `data`.
# NOTE(review): threads/blocks reuse Tx..Bz from the launch-configuration cell;
# presumably they have no effect on the CPU device - confirm.
@launch GPUifyLoops.CPU() fill_halo_regions_x!(Nx, Ny, Nz, data, threads=(Tx, Ty, Tz), blocks=(Bx, By, Bz))

In [None]:
# Display `data` to inspect the index-0 and index-(Nx+1) x halos after the launch.
data

In [None]:
# Same layout as the CPU test array, but the storage is a CuArray created from a
# zero-filled host array. Only the interior 1:Nx x 1:Ny x 1:Nz is randomized.
underlying_data = CuArray(zeros(Nx + 2, Ny + 2, Nz))
data = OffsetArray(underlying_data, 0:(Nx + 1), 0:(Ny + 1), 1:Nz)

@views data[1:Nx, 1:Ny, 1:Nz] .= rand.();

In [None]:
# Launch the positional-argument x-halo kernel on GPUifyLoops' CUDA device with the
# CuArray-backed OffsetArray `data`, using the Tx..Bz layout from the configuration cell.
# NOTE(review): the kernel takes k from `blockIdx().z`, so the z block count must
# cover all Nz levels - confirm the value of Bz before relying on the result.
@launch GPUifyLoops.CUDA() fill_halo_regions_x!(Nx, Ny, Nz, data, threads=(Tx, Ty, Tz), blocks=(Bx, By, Bz))