In [1]:
using Pkg
# cd("D:\\Home\\Git\\Oceananigans.jl")
# cd("C:\\Users\\Ali\\Documents\\Git\\Oceananigans.jl\\")
cd("/home/gridsan/aramadhan/Oceananigans.jl/")
Pkg.activate(".");

In [2]:
using FFTW, Test, Statistics
using GPUifyLoops
using Oceananigans, Oceananigans.Operators

In [3]:
using CuArrays, CUDAnative

In [4]:
"""
    dct_dim3_gpu!(f)

Overwrite the 3D complex array `f` with a transform along dimension 3 whose real part is
the unnormalized type-II discrete cosine transform of the input. The notebook compares
that real part against `FFTW.REDFT10`.

The transform is computed as: reorder the planes along dimension 3, take an in-place
complex FFT along that dimension, then multiply plane `k` by `2exp(-iπ(k-1)/(2Nz))`.
The imaginary part is left in `f`; the caller is expected to discard it.

Returns `nothing`.
"""
function dct_dim3_gpu!(f)
    Nz = size(f, 3)

    # Reorder along dimension 3: odd-numbered planes ascending, followed by the
    # even-numbered planes descending. The descending range must start at the largest
    # *even* index; starting at Nz itself would re-select odd planes whenever Nz is odd.
    last_even = 2 * (Nz ÷ 2)
    f .= cat(f[:, :, 1:2:Nz], f[:, :, last_even:-2:2]; dims=3)
    fft!(f, 3)

    factors = 2 * exp.(collect(-1im*π*(0:Nz-1) / (2*Nz)))

    # A 1×1×Nz array broadcasts over the first two dimensions, so there is no need to
    # build (and upload) a full-size copy of the factors with `repeat`.
    f .*= cu(reshape(factors, 1, 1, Nz))

    nothing
end

dct_dim3_gpu! (generic function with 1 method)

In [191]:
"""
    idct_dim3_gpu!(f)

Overwrite the 3D complex array `f` with an inverse cosine transform along dimension 3,
keeping only the real part of the result (stored back into `f`, so the imaginary part
becomes zero).

Steps: multiply plane `k` by `exp(iπ(k-1)/(2Nz))` (the first plane additionally by 0.5),
take an in-place inverse FFT along dimension 3, then interleave the first half of the
planes with the reversed second half. The notebook compares the result against
`FFTW.REDFT01` applied to data pre-scaled by `1/(2Nz)`.

Throws `ArgumentError` if the size along dimension 3 is odd, because the interleaving
step needs two halves of equal length. Returns `nothing`.
"""
function idct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)

    # Validate before touching `f`, so an unsupported size does not leave `f`
    # half-transformed.
    iseven(Nz) || throw(ArgumentError(
        "idct_dim3_gpu! requires an even size along dimension 3, got $Nz"))
    half = Nz ÷ 2

    bfactors = exp.(collect(1im*π*(0:Nz-1) / (2*Nz)))
    bfactors[1] *= 0.5

    # A 1×1×Nz array broadcasts over the first two dimensions; no `repeat` needed.
    f .*= cu(reshape(bfactors, 1, 1, Nz))
    ifft!(f, 3)

    # Stack the first half and the reversed second half along a 4th dimension, swap
    # dimensions 3 and 4, and flatten: this interleaves the two halves plane by plane.
    stacked = cat(f[:, :, 1:half], f[:, :, end:-1:half+1]; dims=4)
    f .= cu(reshape(permutedims(stacked, (1, 2, 4, 3)), Nx, Ny, Nz))
    @. f = real(f)

    nothing
end

idct_dim3_gpu! (generic function with 1 method)

In [6]:
# Kernel that writes ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k]) for the
# indices covered by the three nested @loop statements.
#
# Arguments:
#   ::Val{Dev}      device tag forwarded to GPUifyLoops' @setup (the notebook passes Val(:GPU))
#   Nx, Ny, Nz      loop extents along dimensions 1, 2 and 3
#   f, ϕ            input and output arrays, both indexed as [i, j, k]
#   kx², ky², kz²   per-dimension arrays indexed by i, j and k respectively
#
# The denominator is not guarded: wherever kx²[i] + ky²[j] + kz²[k] is zero this divides
# by zero. The notebook overwrites ϕ[1, 1, 1] with 0 after calling this function.
function f2ϕ!(::Val{Dev}, Nx, Ny, Nz, f, ϕ, kx², ky², kz²) where Dev
    @setup Dev

    # Each @loop header is written as (range; index expression). Presumably the range is
    # used for a plain CPU loop and the CUDA index expression on the GPU — confirm against
    # the GPUifyLoops documentation.
    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                # @inbounds disables bounds checking, so the extents must match the arrays.
                @inbounds ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
            end
        end
    end

    @synchronize
end

f2ϕ! (generic function with 1 method)

In [7]:
# Number of grid points along each dimension.
Nx = 32
Ny = 16
Nz = 8

# Domain extent along each dimension.
Lx = Ly = Lz = 100

# CUDA launch configuration: Tx×Ty threads per block, enough blocks to tile the x and y
# extents exactly (Int(...) throws if they do not divide evenly), and Nz blocks along z.
Tx = Ty = 16
Bx = Int(Nx / Tx)
By = Int(Ny / Ty)
Bz = Nz

# One model with the default constructor arguments and one built with :gpu and Float32.
model = Model((Nx, Ny, Nz), (Lx, Ly, Lz))
model_gpu = Model((Nx, Ny, Nz), (Lx, Ly, Lz), :gpu, Float32);

Planning Fourier transforms... (planner_flag=FFTW.PATIENT)
FFT!:    0.154888 seconds (44 allocations: 3.031 KiB)
IFFT!:   0.189867 seconds (16.84 k allocations: 733.172 KiB)
DCT!:    0.016050 seconds (11.50 k allocations: 597.242 KiB)
IDCT!:   0.010673 seconds (10.93 k allocations: 560.273 KiB)


In [192]:
# Short aliases for the CPU model's scratch fields and pressure field.
RHS      = model.stepper_tmp.fCC1
RHS_orig = model.stepper_tmp.fC1
ϕ        = model.stepper_tmp.fCC2
pNHS     = model.pressures.pNHS
∇²p      = model.stepper_tmp.fC2

# Random source term with its mean removed.
RAND = rand(Nx, Ny, Nz)
RAND .-= mean(RAND)

# Keep an untouched copy in RHS_orig; RHS is the working copy.
RHS_orig.data .= RAND
RHS.data .= RHS_orig.data;

In [193]:
# Short aliases for the GPU model's scratch fields and pressure field.
RHS_gpu      = model_gpu.stepper_tmp.fCC1
RHS_orig_gpu = model_gpu.stepper_tmp.fC1
ϕ_gpu        = model_gpu.stepper_tmp.fCC2
pNHS_gpu     = model_gpu.pressures.pNHS
∇²p_gpu      = model_gpu.stepper_tmp.fC2

# Load the same random source term used for the CPU model.
RHS_orig_gpu.data .= cu(RAND)
RHS_gpu.data .= RHS_orig_gpu.data;

In [194]:
# RAND had its own mean subtracted, so the remaining mean should be zero up to roundoff.
@test abs(mean(RAND)) < 1e-15
# Both right-hand sides were filled from the same RAND, so they should agree approximately.
@test RHS.data ≈ RHS_gpu.data

[32m[1mTest Passed[22m[39m

In [195]:
# solve_poisson_3d_ppn!(model.grid, RHS, ϕ)
# solve_poisson_3d_ppn_gpu!(Tx, Ty, Bx, By, Bz, model_gpu.grid, RHS_gpu, ϕ_gpu, kx², ky², kz²)

In [196]:
# Short aliases for the two grids and for the working right-hand sides.
g = model.grid
g_gpu = model_gpu.grid

f = RHS
f_gpu = RHS_gpu;

In [197]:
# Per-dimension denominators used when dividing the transformed right-hand side.
# The x and y entries use sin((n-1)π/N); the z entry uses sin((n-1)π/(2N)).
# NOTE(review): presumably these are the eigenvalues of the discrete Laplacian for
# periodic (x, y) and cosine-transform (z) directions — confirm against the solver.
kx² = zeros(g.Nx, 1)
ky² = zeros(g.Ny, 1)
kz² = zeros(g.Nz, 1)

for i in 1:g.Nx; kx²[i] = (2sin((i-1)*π/g.Nx)    / (g.Lx/g.Nx))^2; end
for j in 1:g.Ny; ky²[j] = (2sin((j-1)*π/g.Ny)    / (g.Ly/g.Ny))^2; end
for k in 1:g.Nz; kz²[k] = (2sin((k-1)*π/(2g.Nz)) / (g.Lz/g.Nz))^2; end

# Upload the finished CPU arrays in one transfer each, instead of recomputing every entry
# and writing it into device memory one element at a time from the host.
kx²_gpu = cu(kx²)
ky²_gpu = cu(ky²)
kz²_gpu = cu(kz²)

@test kx² ≈ kx²_gpu
@test ky² ≈ ky²_gpu
@test kz² ≈ kz²_gpu

[32m[1mTest Passed[22m[39m

In [198]:
# Step-by-step comparison of the CPU and GPU solution pipelines. Every stage transforms
# the CPU and GPU buffers in place, so the statements must run in this order. After each
# stage, the `*_dis` variable counts the elements for which the element-wise `≈` test
# between the CPU and GPU arrays fails, and the count is printed out of Nx*Ny*Nz.

# Stage 1: cosine transform along dimension 3.
# CPU: FFTW's REDFT10 real-to-real transform.
FFTW.r2r!(f.data, FFTW.REDFT10, 3)

# GPU: dct_dim3_gpu! leaves a complex result, so only its real part is kept.
dct_dim3_gpu!(f_gpu.data)
@. f_gpu.data = real(f_gpu.data)

f3_dis = sum(.!(real.(f.data) .≈ real.(Array(f_gpu.data)))); println("f3 disagreement: $f3_dis/$(Nx*Ny*Nz)");

### Stage 2: complex FFT along dimensions 1 and 2.

FFTW.fft!(f.data, [1, 2])
fft!(f_gpu.data, [1, 2])

fh_dis = sum(.!(f.data .≈ Array(f_gpu.data))); println("fh disagreement: $fh_dis/$(Nx*Ny*Nz)");
# @test f.data ≈ f_gpu.data

### Stage 3: divide by -(kx² + ky² + kz²).

for k in 1:g.Nz, j in 1:g.Ny, i in 1:g.Nx
    @inbounds ϕ.data[i, j, k] = -f.data[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
end
# The denominator is zero at [1, 1, 1] (all three sine terms vanish), so that entry is
# overwritten with 0.
ϕ.data[1, 1, 1] = 0

# GPU: same division performed by the f2ϕ! kernel, followed by the same overwrite.
@cuda threads=(Tx, Ty) blocks=(Bx, By, Bz) f2ϕ!(Val(:GPU), g_gpu.Nx, g_gpu.Ny, g_gpu.Nz, f_gpu.data, ϕ_gpu.data, kx²_gpu, ky²_gpu, kz²_gpu)
ϕ_gpu.data[1, 1, 1] = 0

ϕh_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕh disagreement: $ϕh_dis/$(Nx*Ny*Nz)");

### Stage 4: inverse complex FFT along dimensions 1 and 2.

FFTW.ifft!(ϕ.data, [1, 2])
ifft!(ϕ_gpu.data, [1, 2])

ϕ3_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕ3 disagreement: $ϕ3_dis/$(Nx*Ny*Nz)");

### Stage 5: inverse cosine transform along dimension 3.

# CPU: keep the real part, scale by 1/(2Nz), then apply FFTW's REDFT01 transform.
@. ϕ.data = real(ϕ.data) / (2g.Nz)
FFTW.r2r!(ϕ.data, FFTW.REDFT01, 3)

# GPU: idct_dim3_gpu! is called on the unscaled data; no explicit 1/(2Nz) is applied here.
idct_dim3_gpu!(ϕ_gpu.data)

ϕ_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕ  disagreement: $ϕ_dis/$(Nx*Ny*Nz)");

# ###

# @. pNHS.data = real(ϕ.data)
# @. pNHS_gpu.data = real(ϕ_gpu.data)
# p_dis = sum(.!(pNHS.data .≈ Array(pNHS_gpu.data))); println("p  disagreement: $p_dis/$(Nx*Ny*Nz)");

f3 disagreement: 0/4096
fh disagreement: 1/4096
ϕh disagreement: 0/4096
ϕ3 disagreement: 2/4096
ϕ  disagreement: 1/4096


In [199]:
# Print the sum and the mean of the CPU and GPU solutions side by side.
@show sum(ϕ.data), sum(ϕ_gpu.data)
@show mean(ϕ.data), mean(ϕ_gpu.data);

(sum(ϕ.data), sum(ϕ_gpu.data)) = (-2.2737367544323206e-12 + 0.0im, -0.00023651123f0 + 0.0f0im)
(mean(ϕ.data), mean(ϕ_gpu.data)) = (-5.551115123125783e-16 + 0.0im, -5.7742f-8 + 0.0f0im)


In [200]:
# Display the k = 5 plane of the CPU solution.
ϕ.data[:, :, 5]

32×16 Array{Complex{Float64},2}:
    2.87261+0.0im    1.04568+0.0im  …   0.672867+0.0im     2.60443+0.0im
    5.31163+0.0im    3.95408+0.0im       1.08577+0.0im      4.7882+0.0im
    6.80871+0.0im     3.5286+0.0im      0.147403+0.0im     6.63486+0.0im
    7.09767+0.0im    1.21478+0.0im      0.455125+0.0im     5.76927+0.0im
    6.07773+0.0im     1.2994+0.0im       2.15741+0.0im      4.1988+0.0im
    2.78456+0.0im    0.57212+0.0im  …    1.42826+0.0im      1.4194+0.0im
    2.02292+0.0im   0.492382+0.0im       2.30813+0.0im      1.4599+0.0im
    1.16826+0.0im  -0.077845+0.0im       1.08782+0.0im    -1.70493+0.0im
 -0.0146837+0.0im    1.13906+0.0im       1.19105+0.0im    -3.59207+0.0im
  -0.405936+0.0im   0.452514+0.0im       -2.0137+0.0im      -4.224+0.0im
   -3.74057+0.0im   0.121658+0.0im  …   -3.47341+0.0im    -3.08093+0.0im
   -3.67216+0.0im   0.485419+0.0im      -4.02874+0.0im    -2.99068+0.0im
    -4.0428+0.0im   -1.75454+0.0im      -2.33733+0.0im    -3.72248+0.0im
           ⋮      

In [201]:
# Display the k = 5 plane of the GPU solution divided by 2Nz.
ϕ_gpu.data[:, :, 5] ./ (2Nz)

32×16 CuArray{Complex{Float32},2}:
     0.179538+0.0im  …   0.0420541+0.0im     0.162777+0.0im
     0.331977+0.0im      0.0678605+0.0im     0.299262+0.0im
     0.425544+0.0im     0.00921267+0.0im     0.414679+0.0im
     0.443604+0.0im      0.0284452+0.0im     0.360579+0.0im
     0.379858+0.0im       0.134838+0.0im     0.262425+0.0im
     0.174035+0.0im  …   0.0892661+0.0im    0.0887125+0.0im
     0.126433+0.0im       0.144258+0.0im    0.0912435+0.0im
     0.073016+0.0im      0.0679885+0.0im    -0.106558+0.0im
 -0.000917807+0.0im      0.0744408+0.0im    -0.224504+0.0im
    -0.025371+0.0im      -0.125856+0.0im       -0.264+0.0im
    -0.233785+0.0im  …   -0.217088+0.0im    -0.192558+0.0im
     -0.22951+0.0im      -0.251796+0.0im    -0.186917+0.0im
    -0.252675+0.0im      -0.146083+0.0im    -0.232655+0.0im
             ⋮       ⋱                               ⋮     
      0.35851+0.0im  …    0.161258+0.0im     0.407634+0.0im
      0.21907+0.0im       0.313112+0.0im     0.405113+0.0im
     

In [202]:
# Element-wise ratio of the GPU solution to the CPU solution on the k = 5 plane.
Array(ϕ_gpu.data[:, :, 5]) ./ ϕ.data[:, :, 5]

32×16 Array{Complex{Float64},2}:
      1.0+0.0im       1.0+0.0im  …  0.999999+0.0im       1.0+0.0im
      1.0+0.0im       1.0+0.0im     0.999999+0.0im       1.0+0.0im
      1.0+0.0im       1.0+0.0im     0.999998+0.0im       1.0+0.0im
      1.0+0.0im  0.999999+0.0im     0.999998+0.0im       1.0+0.0im
      1.0+0.0im  0.999999+0.0im          1.0+0.0im       1.0+0.0im
 0.999999+0.0im  0.999997+0.0im  …       1.0+0.0im  0.999999+0.0im
 0.999999+0.0im  0.999998+0.0im     0.999999+0.0im  0.999999+0.0im
 0.999999+0.0im       1.0-0.0im     0.999999+0.0im       1.0-0.0im
  1.00008-0.0im  0.999999+0.0im     0.999999+0.0im       1.0-0.0im
      1.0-0.0im  0.999999+0.0im          1.0-0.0im       1.0-0.0im
      1.0-0.0im  0.999992+0.0im  …       1.0-0.0im       1.0-0.0im
      1.0-0.0im  0.999999+0.0im          1.0-0.0im       1.0-0.0im
      1.0-0.0im       1.0-0.0im          1.0-0.0im       1.0-0.0im
         ⋮                       ⋱                          ⋮     
      1.0+0.0im       1.0-0.0