In [3]:
using Pkg
# cd("D:\\Home\\Git\\Oceananigans.jl")
# cd("C:\\Users\\Ali\\Documents\\Git\\Oceananigans.jl\\")
cd("/home/gridsan/aramadhan/Oceananigans.jl/")
Pkg.activate(".");

In [4]:
using FFTW, Test, Statistics
using GPUifyLoops
using Oceananigans, Oceananigans.Operators

In [5]:
using CuArrays, CUDAnative

In [4]:
"""
    dct_dim3_gpu!(f)

Transform the complex device array `f` in place along its third dimension using a
permutation, a complex FFT and a per-level twiddle factor. The real part of the result is
what this notebook compares against `FFTW.r2r!(…, FFTW.REDFT10, 3)`; taking the real part
is left to the caller.

NOTE(review): for odd `Nz` the range `Nz:-2:2` selects odd levels again, so the reordering
presumably requires an even `Nz` — confirm before using other sizes.
"""
function dct_dim3_gpu!(f)
    Nz = size(f, 3)

    # Reorder levels: odd-numbered levels ascending, then every second level counting down from Nz.
    f .= cat(f[:, :, 1:2:Nz], f[:, :, Nz:-2:2]; dims=3)
    fft!(f, 3)

    # One twiddle factor per level. Broadcasting a (1, 1, Nz) array against `f` applies it
    # to every (i, j) column without tiling an Nx×Ny×Nz copy on the host first.
    factors = 2 * exp.(-1im*π*(0:Nz-1) / (2*Nz))
    f .*= cu(reshape(factors, 1, 1, Nz))

    return nothing
end

dct_dim3_gpu! (generic function with 1 method)

In [5]:
"""
    idct_dim3_gpu!(f)

Undo `dct_dim3_gpu!` along the third dimension of the complex device array `f`, in place:
multiply by per-level twiddle factors (the first one halved), apply an inverse FFT,
interleave the two halves of the result, and finally replace `f` by its real part.

`Int(Nz/2)` throws an `InexactError` when `Nz` is odd.
"""
function idct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)

    bfactors = exp.(1im*π*(0:Nz-1) / (2*Nz))
    bfactors[1] *= 0.5

    # A (1, 1, Nz) array broadcasts over every (i, j) column; no Nx×Ny×Nz tiling needed.
    f .*= cu(reshape(bfactors, 1, 1, Nz))
    ifft!(f, 3)

    # Interleave: output level 2h-1 comes from level h of the first half, output level 2h
    # from level Nz-h+1 (the second half read backwards).
    half = Int(Nz/2)
    front = f[:, :, 1:half]
    back = f[:, :, end:-1:half+1]
    interleaved = permutedims(cat(front, back; dims=4), (1, 2, 4, 3))
    f .= cu(reshape(interleaved, Nx, Ny, Nz))
    @. f = real(f)

    return nothing
end

idct_dim3_gpu! (generic function with 1 method)

In [6]:
# Kernel for the spectral-space division step of the Poisson solve:
#     ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
# for i in 1:Nx, j in 1:Ny, k in 1:Nz.
#
# Each `@loop` header pairs a range with a CUDA index expression (block index in z, block
# and thread indices in x and y) — presumably the range is used on the CPU and the index
# expression on the GPU, selected by `Dev`; confirm against GPUifyLoops.
#
# The kernel does not special-case a zero denominator. Every caller in this notebook
# overwrites ϕ[1, 1, 1] with 0 right after the launch.
function f2ϕ!(::Val{Dev}, Nx, Ny, Nz, f, ϕ, kx², ky², kz²) where Dev
    @setup Dev

    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                # @inbounds: no bounds checks, so the arrays must cover 1:Nx, 1:Ny, 1:Nz.
                @inbounds ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
            end
        end
    end

    @synchronize
end

f2ϕ! (generic function with 1 method)

In [7]:
# Grid size and domain extents shared by the cells below.
Nx, Ny, Nz = 32, 16, 8
Lx, Ly, Lz = 100, 100, 100

# CUDA launch configuration used by the `@cuda` calls in later cells.
# Int(Nx/Tx) throws an InexactError unless Nx is a multiple of Tx (likewise Ny and Ty).
Tx, Ty = 16, 16  # Threads per block
Bx, By, Bz = Int(Nx/Tx), Int(Ny/Ty), Nz  # Blocks in grid.

# One model with default options (used as the CPU reference below) and one built with
# `:gpu` and Float32.
model = Model((Nx, Ny, Nz), (Lx, Ly, Lz));
model_gpu = Model((Nx, Ny, Nz), (Lx, Ly, Lz), :gpu, Float32);

Planning Fourier transforms... (planner_flag=FFTW.PATIENT)
FFT!:    0.144970 seconds (44 allocations: 3.031 KiB)
IFFT!:   0.180569 seconds (16.84 k allocations: 733.172 KiB)
DCT!:    0.015728 seconds (11.50 k allocations: 597.242 KiB)
IDCT!:   0.010615 seconds (10.93 k allocations: 560.273 KiB)


In [8]:
# Short names for the reference model's scratch fields and pressure field.
RHS, RHS_orig, ϕ, pNHS, ∇²p = model.stepper_tmp.fCC1, model.stepper_tmp.fC1, model.stepper_tmp.fCC2, model.pressures.pNHS, model.stepper_tmp.fC2

# Random right-hand side with its mean subtracted; RHS_orig keeps an untouched copy
# because RHS is transformed in place further down.
RAND = rand(Nx, Ny, Nz);
RAND .= RAND .- mean(RAND);
RHS_orig.data .= RAND;

RHS.data .= RHS_orig.data;

In [9]:
# Same aliases for the GPU model's scratch fields and pressure field.
RHS_gpu, RHS_orig_gpu, ϕ_gpu = model_gpu.stepper_tmp.fCC1, model_gpu.stepper_tmp.fC1, model_gpu.stepper_tmp.fCC2
pNHS_gpu, ∇²p_gpu = model_gpu.pressures.pNHS, model_gpu.stepper_tmp.fC2

# Upload the same random right-hand side so both models start from identical data.
RHS_orig_gpu.data .= cu(RAND);
RHS_gpu.data .= RHS_orig_gpu.data;

In [10]:
# The right-hand side must have (numerically) zero mean, and the CPU and GPU copies
# must agree before any transform is applied.
@test abs(mean(RAND)) < 1e-15
@test RHS.data ≈ RHS_gpu.data

[32m[1mTest Passed[22m[39m

In [11]:
# solve_poisson_3d_ppn!(model.grid, RHS, ϕ)
# solve_poisson_3d_ppn_gpu!(Tx, Ty, Bx, By, Bz, model_gpu.grid, RHS_gpu, ϕ_gpu, kx², ky², kz²)

In [12]:
# Short aliases: g / g_gpu are the two grids, f / f_gpu the fields transformed in place below.
g, g_gpu = model.grid, model_gpu.grid
f, f_gpu = RHS, RHS_gpu;

In [13]:
# Host tables of squared wavenumbers, one column vector per direction.
# x and y use the argument (n-1)π/N; z uses (n-1)π/(2N).
kx² = zeros(g.Nx, 1)
ky² = zeros(g.Ny, 1)
kz² = zeros(g.Nz, 1)

for i in 1:g.Nx
    kx²[i] = (2sin((i-1)*π/g.Nx) / (g.Lx/g.Nx))^2
end

for j in 1:g.Ny
    ky²[j] = (2sin((j-1)*π/g.Ny) / (g.Ly/g.Ny))^2
end

for k in 1:g.Nz
    kz²[k] = (2sin((k-1)*π/(2g.Nz)) / (g.Lz/g.Nz))^2
end

# Device copies of the wavenumber tables. The formulas are the ones already evaluated
# into kx², ky², kz² on the host, so upload those tables in one transfer each instead of
# writing the device arrays one element at a time.
kx²_gpu = cu(kx²)
ky²_gpu = cu(ky²)
kz²_gpu = cu(kz²)

# Host and device tables must agree to within the device's precision.
@test kx² ≈ kx²_gpu
@test ky² ≈ ky²_gpu
@test kz² ≈ kz²_gpu

[32m[1mTest Passed[22m[39m

In [14]:
# Step-by-step comparison of the CPU (FFTW) and GPU Poisson solves. After every stage the
# number of entries that fail an elementwise ≈ is printed. Elementwise ≈ uses a relative
# tolerance only, so entries very close to zero can be reported as disagreeing.

# Stage 1: cosine transform along dimension 3. FFTW's REDFT10 on the CPU; on the GPU the
# FFT-based routine, of which only the real part is kept.
FFTW.r2r!(f.data, FFTW.REDFT10, 3)

dct_dim3_gpu!(f_gpu.data)
@. f_gpu.data = real(f_gpu.data)

f3_dis = sum(.!(real.(f.data) .≈ real.(Array(f_gpu.data)))); println("f3 disagreement: $f3_dis/$(Nx*Ny*Nz)");

# Stage 2: complex FFT along dimensions 1 and 2.

FFTW.fft!(f.data, [1, 2])
fft!(f_gpu.data, [1, 2])

fh_dis = sum(.!(f.data .≈ Array(f_gpu.data))); println("fh disagreement: $fh_dis/$(Nx*Ny*Nz)");
# @test f.data ≈ f_gpu.data

# Stage 3: divide by the summed squared wavenumbers. The (1, 1, 1) entry has a zero
# denominator and is overwritten with 0 afterwards on both devices.

for k in 1:g.Nz, j in 1:g.Ny, i in 1:g.Nx
    @inbounds ϕ.data[i, j, k] = -f.data[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
end
ϕ.data[1, 1, 1] = 0

@cuda threads=(Tx, Ty) blocks=(Bx, By, Bz) f2ϕ!(Val(:GPU), g_gpu.Nx, g_gpu.Ny, g_gpu.Nz, f_gpu.data, ϕ_gpu.data, kx²_gpu, ky²_gpu, kz²_gpu)
ϕ_gpu.data[1, 1, 1] = 0

ϕh_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕh disagreement: $ϕh_dis/$(Nx*Ny*Nz)");

# Stage 4: inverse FFT along dimensions 1 and 2.

FFTW.ifft!(ϕ.data, [1, 2])
ifft!(ϕ_gpu.data, [1, 2])

ϕ3_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕ3 disagreement: $ϕ3_dis/$(Nx*Ny*Nz)");

# Stage 5: inverse cosine transform along dimension 3. The CPU path divides by 2Nz before
# REDFT01; the GPU routine is called without a separate scaling step.

@. ϕ.data = real(ϕ.data) / (2g.Nz)
FFTW.r2r!(ϕ.data, FFTW.REDFT01, 3)

idct_dim3_gpu!(ϕ_gpu.data)

ϕ_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕ  disagreement: $ϕ_dis/$(Nx*Ny*Nz)");

# Stage 6: store the real part of the solution in the pressure fields and compare.

@. pNHS.data = real(ϕ.data)
@. pNHS_gpu.data = real(ϕ_gpu.data)
p_dis = sum(.!(pNHS.data .≈ Array(pNHS_gpu.data))); println("p  disagreement: $p_dis/$(Nx*Ny*Nz)");

f3 disagreement: 0/4096
fh disagreement: 1/4096
ϕh disagreement: 0/4096
ϕ3 disagreement: 1/4096
ϕ  disagreement: 1/4096
p  disagreement: 1/4096


In [15]:
# Print the sum and mean of the CPU and GPU solutions side by side.
@show sum(ϕ.data), sum(ϕ_gpu.data)
@show mean(ϕ.data), mean(ϕ_gpu.data);

(sum(ϕ.data), sum(ϕ_gpu.data)) = (-1.8189894035458565e-12 + 0.0im, 0.003162384f0 + 0.0f0im)
(mean(ϕ.data), mean(ϕ_gpu.data)) = (-4.440892098500626e-16 + 0.0im, 7.720664f-7 + 0.0f0im)


In [16]:
# Cyclic successor of index `a` on 1:n (wraps n back to one(a)).
@inline function incmod1(a, n)
    a == n && return one(a)
    return a + 1
end

# Cyclic predecessor of index `a` on 1:n (wraps 1 back to n).
@inline function decmod1(a, n)
    a == 1 && return n
    return a - 1
end

"""
    ∇²_ppn(g, f)

Return a new `Float64` array of size `(g.Nx, g.Ny, g.Nz)` holding the second-order
finite-difference Laplacian of `f`.

Neighbours wrap around periodically in the first two dimensions. In the third dimension
the first and last levels use the one-sided forms `(f[2] - f[1]) / Δz^2` and
`(f[end-1] - f[end]) / Δz^2`. Grid spacings are read from `g.Δx`, `g.Δy`, `g.Δz`.
"""
function ∇²_ppn(g, f)
    Nx, Ny, Nz = g.Nx, g.Ny, g.Nz
    Δx, Δy, Δz = g.Δx, g.Δy, g.Δz

    # Periodic centred second differences in the two horizontal directions.
    xdiff(i, j, k) = (f[incmod1(i, Nx), j, k] - 2*f[i, j, k] + f[decmod1(i, Nx), j, k]) / Δx^2
    ydiff(i, j, k) = (f[i, incmod1(j, Ny), k] - 2*f[i, j, k] + f[i, decmod1(j, Ny), k]) / Δy^2

    ∇²f = zeros(Nx, Ny, Nz)

    # Interior levels: full three-point stencil in every direction.
    for k in 2:(Nz-1), j in 1:Ny, i in 1:Nx
        zdiff = (f[i, j, k+1] - 2*f[i, j, k] + f[i, j, k-1]) / Δz^2
        ∇²f[i, j, k] = xdiff(i, j, k) + ydiff(i, j, k) + zdiff
    end

    # First and last levels: one-sided vertical difference plus the horizontal terms.
    top = lastindex(f, 3)
    for j in 1:Ny, i in 1:Nx
        ∇²f[i, j, 1] = (f[i, j, 2] - f[i, j, 1]) / Δz^2 + xdiff(i, j, 1) + ydiff(i, j, 1)
        ∇²f[i, j, end] = (f[i, j, top-1] - f[i, j, top]) / Δz^2 + xdiff(i, j, top) + ydiff(i, j, top)
    end

    return ∇²f
end

∇²_ppn (generic function with 1 method)

In [17]:
# The Laplacian of the CPU solution must reproduce the original right-hand side (compared in Float32).
@test Float32.(∇²_ppn(g, pNHS.data)) ≈ RAND

[32m[1mTest Passed[22m[39m

In [18]:
# Same check for the GPU solution, copied back to the host first.
@test Float32.(∇²_ppn(g, Array(pNHS_gpu.data))) ≈ RAND

[32m[1mTest Passed[22m[39m

In [84]:
# Kernel that copies A into B with the levels of dimension 3 reordered:
#   odd k  -> B[:, :, floor(k/2) + 1]          (odd levels, ascending, fill the front)
#   even k -> B[:, :, Nz - floor((k-1)/2)]     (even levels fill the back, in reverse)
# A and B must be different arrays, since each index reads one level and writes another.
#
# The branch and index arithmetic use CUDAnative intrinsics, so although the signature
# takes `Val{Dev}` this presumably only runs with Dev == :GPU — confirm.
function dct_permute!(::Val{Dev}, Nx, Ny, Nz, A, B) where Dev
    @setup Dev

    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                # Stand-in for isodd(k) per the original note; presumably ffs returns the
                # 1-based position of the lowest set bit — confirm.
                if CUDAnative.ffs(k) == 1  # isodd(k)
                    @inbounds B[i, j, convert(UInt32, CUDAnative.floor(k/2) + 1)] = A[i, j, k]
                else
                    @inbounds B[i, j, convert(UInt32, Nz - CUDAnative.floor((k-1)/2))] = A[i, j, k]
                end
            end
        end
    end

    @synchronize
end

"""
    dct_dim3_gpu!(f)

Transform the complex device array `f` in place along its third dimension: reorder the
levels with the `dct_permute!` kernel, apply a complex FFT, and multiply each level by its
twiddle factor. Callers in this notebook take the real part of the result afterwards.

The kernel launch configuration is derived from `size(f)` (at most 16×16 threads per
block, one block layer per level) instead of from notebook-level globals.

# Throws
- `ArgumentError` if the first two dimensions are not multiples of the thread-block size.
"""
function dct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)

    # Blocks must tile the horizontal plane exactly, because the permutation kernel
    # indexes with @inbounds.
    threads = (min(Nx, 16), min(Ny, 16))
    (Nx % threads[1] == 0 && Ny % threads[2] == 0) ||
        throw(ArgumentError("size(f)[1:2] = ($Nx, $Ny) must be a multiple of the thread block $threads"))
    blocks = (Nx ÷ threads[1], Ny ÷ threads[2], Nz)

    # The permutation is not in place: write into a scratch array, then copy back.
    permuted = CuArray{eltype(f)}(undef, size(f))
    @cuda threads=threads blocks=blocks dct_permute!(Val(:GPU), Nx, Ny, Nz, f, permuted)
    f .= permuted

    fft!(f, 3)

    # One twiddle factor per level. Broadcasting a (1, 1, Nz) array avoids tiling an
    # Nx×Ny×Nz copy on the host before upload.
    factors = 2 * exp.(-1im*π*(0:Nz-1) / (2*Nz))
    f .*= cu(reshape(factors, 1, 1, Nz))

    return nothing
end

# Kernel that copies A into B with the levels of dimension 3 interleaved — the inverse of
# the reordering done by `dct_permute!`:
#   front part (2k - 1 <= Nz) -> B[:, :, 2k - 1]          (odd output levels)
#   back part                 -> B[:, :, 2(Nz - k + 1)]   (even output levels, reversed)
# A and B must be different arrays.
#
# The split is tested with integers as `2k - 1 <= Nz`. For even Nz this is the same as
# `k <= Nz/2`. For odd Nz it keeps the middle level k = (Nz + 1)/2 in the first branch,
# where it maps to level Nz; the float test sent it to level Nz + 1, an out-of-bounds
# write under @inbounds.
function idct_permute!(::Val{Dev}, Nx, Ny, Nz, A, B) where Dev
    @setup Dev

    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                if 2k - 1 <= Nz
                    @inbounds B[i, j, 2k-1] = A[i, j, k]
                else
                    @inbounds B[i, j, 2(Nz-k+1)] = A[i, j, k]
                end
            end
        end
    end

    @synchronize
end

"""
    idct_dim3_gpu!(f)

Undo `dct_dim3_gpu!` along the third dimension of the complex device array `f`, in place:
multiply by per-level twiddle factors (the first one halved), apply an inverse FFT, and
interleave the levels with the `idct_permute!` kernel.

The result is left complex; this notebook takes the real part when copying the solution
into the pressure field.

The kernel launch configuration is derived from `size(f)` (at most 16×16 threads per
block, one block layer per level) instead of from notebook-level globals.

# Throws
- `ArgumentError` if the first two dimensions are not multiples of the thread-block size.
"""
function idct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)

    bfactors = exp.(1im*π*(0:Nz-1) / (2*Nz))
    bfactors[1] *= 0.5

    # A (1, 1, Nz) array broadcasts over every (i, j) column; no Nx×Ny×Nz tiling needed.
    f .*= cu(reshape(bfactors, 1, 1, Nz))
    ifft!(f, 3)

    # Blocks must tile the horizontal plane exactly, because the permutation kernel
    # indexes with @inbounds.
    threads = (min(Nx, 16), min(Ny, 16))
    (Nx % threads[1] == 0 && Ny % threads[2] == 0) ||
        throw(ArgumentError("size(f)[1:2] = ($Nx, $Ny) must be a multiple of the thread block $threads"))
    blocks = (Nx ÷ threads[1], Ny ÷ threads[2], Nz)

    # The permutation is not in place: write into a scratch array, then copy back.
    permuted = CuArray{eltype(f)}(undef, size(f))
    @cuda threads=threads blocks=blocks idct_permute!(Val(:GPU), Nx, Ny, Nz, f, permuted)
    f .= permuted

    return nothing
end

"""
    solve_poisson_3d_ppn_gpu_NEW!(Tx, Ty, Bx, By, Bz, g, f, ϕ, kx², ky², kz²)

Solve the Poisson problem with right-hand side `f` on the GPU and leave the (complex)
solution in `ϕ.data`. `f.data` is overwritten with its transform along the way.

Steps: cosine transform along dimension 3 (real part kept), FFT along dimensions 1 and 2,
division by `kx² + ky² + kz²` in the `f2ϕ!` kernel launched with `threads=(Tx, Ty)` and
`blocks=(Bx, By, Bz)`, zeroing of the `(1, 1, 1)` entry, then the inverse transforms in
reverse order. Returns `nothing`.
"""
function solve_poisson_3d_ppn_gpu_NEW!(Tx, Ty, Bx, By, Bz, g::RegularCartesianGrid, f::CellField, ϕ::CellField, kx², ky², kz²)
    rhs = f.data
    sol = ϕ.data

    # Forward transforms, all in place on the right-hand side.
    dct_dim3_gpu!(rhs)
    rhs .= real.(rhs)
    fft!(rhs, [1, 2])

    # Spectral division; the (1, 1, 1) entry has a zero denominator and is reset to 0.
    @cuda threads=(Tx, Ty) blocks=(Bx, By, Bz) f2ϕ!(Val(:GPU), g.Nx, g.Ny, g.Nz, rhs, sol, kx², ky², kz²)
    sol[1, 1, 1] = 0

    # Inverse transforms, in place on the solution.
    ifft!(sol, [1, 2])
    idct_dim3_gpu!(sol)

    return nothing
end

# Kernel for the spectral-space division step of the Poisson solve:
#     ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
# for i in 1:Nx, j in 1:Ny, k in 1:Nz. This repeats, unchanged, the definition from an
# earlier cell of this notebook.
#
# The kernel does not special-case a zero denominator; the caller resets ϕ[1, 1, 1] to 0
# after the launch.
function f2ϕ!(::Val{Dev}, Nx, Ny, Nz, f, ϕ, kx², ky², kz²) where Dev
    @setup Dev

    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                # @inbounds: no bounds checks, so the arrays must cover 1:Nx, 1:Ny, 1:Nz.
                @inbounds ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
            end
        end
    end

    @synchronize
end

f2ϕ! (generic function with 1 method)

In [85]:
# Grid size and domain extents for the GPU-only end-to-end test.
Nx, Ny, Nz = 32, 16, 8
Lx, Ly, Lz = 100, 100, 100

# CUDA launch configuration; Int(Nx/Tx) throws an InexactError unless Nx is a multiple
# of Tx (likewise Ny and Ty).
Tx, Ty = 16, 16  # Threads per block
Bx, By, Bz = Int(Nx/Tx), Int(Ny/Ty), Nz  # Blocks in grid.

model_gpu = Model((Nx, Ny, Nz), (Lx, Ly, Lz), :gpu, Float32);

# NOTE: rebinds the global `g` to the GPU model's grid.
g = model_gpu.grid

# Squared wavenumber tables for the spectral division, one N×1 column per direction.
# They are evaluated on the host and uploaded once, rather than written into the device
# arrays one element at a time.
kx²_gpu = cu(reshape([(2sin((i-1)*π/g.Nx)    / (g.Lx/g.Nx))^2 for i in 1:g.Nx], :, 1))
ky²_gpu = cu(reshape([(2sin((j-1)*π/g.Ny)    / (g.Ly/g.Ny))^2 for j in 1:g.Ny], :, 1))
kz²_gpu = cu(reshape([(2sin((k-1)*π/(2g.Nz)) / (g.Lz/g.Nz))^2 for k in 1:g.Nz], :, 1))

# Scratch fields of the GPU model, plus two fresh Float32 cell fields for the pressure
# and for the Laplacian used in the final check.
RHS_orig = model_gpu.stepper_tmp.fC1
RHS      = model_gpu.stepper_tmp.fCC1
ϕ        = model_gpu.stepper_tmp.fCC2
pNHS     = CellField(model_gpu.metadata, model_gpu.grid, Float32)
∇²ϕ      = CellField(model_gpu.metadata, model_gpu.grid, Float32)

# Random Float32 right-hand side with its mean subtracted; RHS_orig keeps the untouched
# copy because the solver overwrites RHS.
RHS_orig.data .= cu(rand(Float32, Nx, Ny, Nz))
RHS_orig.data .= RHS_orig.data .- mean(RHS_orig.data)
RHS.data .= RHS_orig.data

@show mean(RHS_orig.data)
@show mean(RHS.data)

@show @test abs(mean(RHS_orig.data)) < 10*eps(Float32)

solve_poisson_3d_ppn_gpu_NEW!(Tx, Ty, Bx, By, Bz, model_gpu.grid, RHS, ϕ, kx²_gpu, ky²_gpu, kz²_gpu)

# The solver leaves a complex solution; keep its real part as the pressure.
pNHS.data .= real.(ϕ.data)

# Verify the solve by applying the package's Laplacian operator to the pressure.
Oceananigans.Operators.∇²_ppn!(model_gpu.grid, pNHS, ∇²ϕ)

# Count entries where the Laplacian matches the right-hand side. Elementwise ≈ uses a
# relative tolerance only, so entries very close to zero can count as mismatches.
print("Correct: " * string(sum(∇²ϕ.data .≈ RHS_orig.data)) * "/" * string(Nx*Ny*Nz))

mean(RHS_orig.data) = 3.8955477f-8
mean(RHS.data) = 4.193861968815327e-8 + 0.0im
#= In[85]:32 =# @test(abs(mean(RHS_orig.data)) < 10 * eps(Float32)) = Test Passed
Correct: 4093/4096