In [1]:
using Pkg
# cd("D:\\Home\\Git\\Oceananigans.jl")
# cd("C:\\Users\\Ali\\Documents\\Git\\Oceananigans.jl\\")
cd("/home/gridsan/aramadhan/Oceananigans.jl/")
Pkg.activate(".");

In [105]:
using FFTW, Test, Statistics
using GPUifyLoops
using Oceananigans, Oceananigans.Operators

In [110]:
using CuArrays, CUDAnative

In [6]:
function dct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)
    f .= cat(f[:, :, 1:2:Nz], f[:, :, Nz:-2:2]; dims=3)
    fft!(f, 3)

    factors = 2 * exp.(collect(-1im*π*(0:Nz-1) / (2*Nz)))
    
    # f .*= repeat(reshape(factors, 1, 1, Nz), Nx, Ny, 1)
    f .*= cu(repeat(reshape(factors, 1, 1, Nz), Nx, Ny, 1))
    
    nothing
end

dct_dim3_gpu! (generic function with 1 method)

In [7]:
function idct_dim3_gpu!(f)
    Nx, Ny, Nz = size(f)
    
    bfactors = 0.5 * exp.(collect(1im*π*(0:Nz-1) / (2*Nz)))
    # f .*= repeat(reshape(bfactors, 1, 1, Nz), Nx, Ny, 1)
    f .*= cu(repeat(reshape(bfactors, 1, 1, Nz), Nx, Ny, 1))
    
    ifft!(f, 3)
    
    # f = cat(f[:, :, 1:Int(Nz/2)], f[:, :, end:-1:Int(Nz/2)+1]; dims=4)
    # f = reshape(permutedims(f, (1, 2, 4, 3)), Nx, Ny, Nz)
    # f .= reshape(permutedims(cat(f[:, :, 1:Int(Nz/2)], f[:, :, end:-1:Int(Nz/2)+1]; dims=4), (1, 2, 4, 3)), Nx, Ny, Nz)
    f .= cu(reshape(permutedims(cat(f[:, :, 1:Int(Nz/2)], f[:, :, end:-1:Int(Nz/2)+1]; dims=4), (1, 2, 4, 3)), Nx, Ny, Nz))
    
    nothing
end

idct_dim3_gpu! (generic function with 1 method)

In [106]:
function f2ϕ!(::Val{Dev}, Nx, Ny, Nz, f, ϕ, kx², ky², kz²) where Dev
    @setup Dev

    @loop for k in (1:Nz; blockIdx().z)
        @loop for j in (1:Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                @inbounds ϕ[i, j, k] = -f[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
            end
        end
    end

    @synchronize
end

f2ϕ! (generic function with 1 method)

In [162]:
Nx, Ny, Nz = 32, 16, 8
Lx, Ly, Lz = 100, 100, 100

Tx, Ty = 16, 16  # Threads per block
Bx, By, Bz = Int(Nx/Tx), Int(Ny/Ty), Nz  # Blocks in grid.

model = Model((Nx, Ny, Nz), (Lx, Ly, Lz));
model_gpu = Model((Nx, Ny, Nz), (Lx, Ly, Lz), :gpu, Float32);

Planning Fourier transforms... (planner_flag=FFTW.PATIENT)
FFT!:    0.000106 seconds (44 allocations: 3.031 KiB)
IFFT!:   0.000081 seconds (52 allocations: 3.438 KiB)
DCT!:    0.000053 seconds (57 allocations: 4.000 KiB)
IDCT!:   0.000037 seconds (57 allocations: 4.000 KiB)


In [380]:
RHS, RHS_orig, ϕ, pNHS, ∇²p = model.stepper_tmp.fCC1, model.stepper_tmp.fC1, model.stepper_tmp.fCC2, model.pressures.pNHS, model.stepper_tmp.fC2

RAND = rand(Nx, Ny, Nz);
RAND .= RAND .- mean(RAND);
RHS_orig.data .= RAND;

RHS.data .= RHS_orig.data;

In [381]:
RHS_gpu, RHS_orig_gpu, ϕ_gpu = model_gpu.stepper_tmp.fCC1, model_gpu.stepper_tmp.fC1, model_gpu.stepper_tmp.fCC2
pNHS_gpu, ∇²p_gpu = model_gpu.pressures.pNHS, model_gpu.stepper_tmp.fC2

RHS_orig_gpu.data .= cu(RAND);
RHS_gpu.data .= RHS_orig_gpu.data;

In [382]:
@test abs(mean(RAND)) < 1e-15
@test RHS.data ≈ RHS_gpu.data

[32m[1mTest Passed[22m[39m

In [383]:
# solve_poisson_3d_ppn!(model.grid, RHS, ϕ)
# solve_poisson_3d_ppn_gpu!(Tx, Ty, Bx, By, Bz, model_gpu.grid, RHS_gpu, ϕ_gpu, kx², ky², kz²)

In [384]:
g, g_gpu = model.grid, model_gpu.grid
f, f_gpu = RHS, RHS_gpu;

In [385]:
kx² = zeros(g.Nx, 1)
ky² = zeros(g.Ny, 1)
kz² = zeros(g.Nz, 1)

for i in 1:g.Nx; kx²[i] = (2sin((i-1)*π/g.Nx)    / (g.Lx/g.Nx))^2; end
for j in 1:g.Ny; ky²[j] = (2sin((j-1)*π/g.Ny)    / (g.Ly/g.Ny))^2; end
for k in 1:g.Nz; kz²[k] = (2sin((k-1)*π/(2g.Nz)) / (g.Lz/g.Nz))^2; end

kx²_gpu = cu(zeros(g.Nx, 1))
ky²_gpu = cu(zeros(g.Ny, 1))
kz²_gpu = cu(zeros(g.Nz, 1))

for i in 1:g.Nx; kx²_gpu[i] = (2sin((i-1)*π/g.Nx)    / (g.Lx/g.Nx))^2; end
for j in 1:g.Ny; ky²_gpu[j] = (2sin((j-1)*π/g.Ny)    / (g.Ly/g.Ny))^2; end
for k in 1:g.Nz; kz²_gpu[k] = (2sin((k-1)*π/(2g.Nz)) / (g.Lz/g.Nz))^2; end

@test kx² ≈ kx²_gpu
@test ky² ≈ ky²_gpu
@test kz² ≈ kz²_gpu

[32m[1mTest Passed[22m[39m

In [386]:
FFTW.r2r!(f.data, FFTW.REDFT10, 3)
FFTW.fft!(f.data, [1, 2])

dct_dim3_gpu!(f_gpu.data)
@. f_gpu.data = real(f_gpu.data)
fft!(f_gpu.data, [1, 2])

f_dis = sum(.!(f.data .≈ Array(f_gpu.data))); println("f  disagreement: $f_dis/$(Nx*Ny*Nz)");
# @test f.data ≈ f_gpu.data

###

for k in 1:g.Nz, j in 1:g.Ny, i in 1:g.Nx
    @inbounds ϕ.data[i, j, k] = -f.data[i, j, k] / (kx²[i] + ky²[j] + kz²[k])
end
ϕ.data[1, 1, 1] = 0

@cuda threads=(Tx, Ty) blocks=(Bx, By, Bz) f2ϕ!(Val(:GPU), g_gpu.Nx, g_gpu.Ny, g_gpu.Nz, f_gpu.data, ϕ_gpu.data, kx²_gpu, ky²_gpu, kz²_gpu)
ϕ_gpu.data[1, 1, 1] = 0

ϕh_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕh disagreement: $ϕh_dis/$(Nx*Ny*Nz)");

###

FFTW.ifft!(ϕ.data, [1, 2])
FFTW.r2r!(ϕ.data, FFTW.REDFT01, 3)
@. ϕ.data = real(ϕ.data) / (2g.Nz)

# @. ϕ_gpu.data = real(ϕ.data) / (2g.Nz)
# idct_dim3_gpu!(ϕ_gpu.data)
# ifft!(ϕ_gpu.data, [1, 2])

ifft!(ϕ_gpu.data, [1, 2])
bfactors = 0.5 * exp.(collect(1im*π*(0:Nz-1) / (2*Nz)))
ϕ_gpu.data .*= cu(repeat(reshape(bfactors, 1, 1, Nz), Nx, Ny, 1))
ifft!(ϕ_gpu.data, 3)
ϕ_gpu.data .= cu(reshape(permutedims(cat(f[:, :, 1:Int(Nz/2)], f[:, :, end:-1:Int(Nz/2)+1]; dims=4), (1, 2, 4, 3)), Nx, Ny, Nz))

ϕ_dis = sum(.!(ϕ.data .≈ Array(ϕ_gpu.data))); println("ϕ  disagreement: $ϕ_dis/$(Nx*Ny*Nz)");

###

@. pNHS.data = real(ϕ.data)
@. pNHS_gpu.data = real(ϕ_gpu.data)
p_dis = sum(.!(pNHS.data .≈ Array(pNHS_gpu.data))); println("p  disagreement: $p_dis/$(Nx*Ny*Nz)");

f  disagreement: 1/4096
ϕh disagreement: 0/4096
ϕ  disagreement: 4096/4096
p  disagreement: 4096/4096


In [387]:
ϕ.data[:, :, 5]

32×16 Array{Complex{Float64},2}:
 -3.33747+0.0im    -5.314+0.0im  …   -1.19503+0.0im   -1.81641+0.0im
 -3.07531+0.0im  -4.03877+0.0im      -0.94983+0.0im   -1.14081+0.0im
 -1.68519+0.0im  -4.18852+0.0im        2.1129+0.0im  -0.366628+0.0im
 -4.54051+0.0im  -1.82712+0.0im       1.71175+0.0im   -3.05093+0.0im
 -6.54036+0.0im  -1.50831+0.0im      -1.15034+0.0im    -3.6687+0.0im
 -7.90298+0.0im   -2.3972+0.0im  …  -0.333339+0.0im   -5.82188+0.0im
 -8.46405+0.0im  -6.53487+0.0im      0.276314+0.0im   -5.08692+0.0im
 -8.48953+0.0im  -6.71646+0.0im      -2.10957+0.0im   -4.21944+0.0im
 -8.12344+0.0im  -6.42927+0.0im       -1.8611+0.0im   -1.92849+0.0im
 -6.21662+0.0im   -6.4009+0.0im      0.440024+0.0im   -2.37709+0.0im
 -4.71794+0.0im  -6.94612+0.0im  …    2.98114+0.0im  -0.546699+0.0im
 -4.76577+0.0im  -8.92432+0.0im       3.10924+0.0im   0.290587+0.0im
 -7.70577+0.0im  -8.84273+0.0im       4.41588+0.0im   0.749933+0.0im
         ⋮                       ⋱                            ⋮     
 

In [388]:
ϕ_gpu.data[:, :, 5]

32×16 CuArray{Complex{Float32},2}:
 -34.4832+0.0im        11.605+19.1868im   …    11.605-19.1868im 
 -11.5234-18.6056im   19.0162-14.7502im        2.7744-3.06063im 
  10.3338+22.3916im  -12.2782+15.4705im       -2.0528-29.9051im 
  36.1655+2.07448im   2.22327-32.3991im      -39.7016-15.9192im 
  13.1356-14.6048im  -19.0754-4.5707im        4.54202+10.7856im 
 -4.57062+8.90372im   4.43946-35.0775im   …    4.6211+9.53563im 
 0.630392+27.4995im  -41.3118+5.42531im      -1.08787+23.2379im 
  9.58397-20.4266im  0.761502-8.2451im        -26.516+7.86889im 
  4.29618-12.8831im   -27.081-4.35466im      -15.6922-2.65586im 
 -24.8636+12.9763im   18.2457-36.1247im      -6.46319-11.4118im 
  3.04924-2.67134im   46.3323+15.535im    …  -22.5783-6.88272im 
   5.5926+1.10012im   6.95601-0.884611im     -4.48355+25.6348im 
 -28.4319-14.4183im   33.9657+38.4658im      -13.4018+16.8195im 
         ⋮                                ⋱          ⋮          
 -28.4319+14.4183im  -13.4018-16.8195im   …   33.9657-3

In [331]:
@. pNHS.data = real(ϕ.data);
∇²_ppn!(model.grid, pNHS, ∇²p)

In [332]:
∇²p.data ≈ RHS_orig.data

true

In [None]:
# pNHS_gpu_array = zeros(Float32, Nx, Ny, Nz);
@. pNHS.data = real(ϕ_gpu.data);

In [None]:
∇²_ppn!(model.grid, pNHS, ∇²p)

In [None]:
maximum(∇²p), maximum(RHS_orig_gpu.data)