In [6]:
import FFTW
using BenchmarkTools

In [7]:
using CUDAdrv, CUDAnative, CuArrays

In [8]:
# Problem size: a 128 x 128 x 128 grid.
Nx = Ny = Nz = 128
# Two independent random Float32 fields: a plain Array and one passed through `cu`.
xc = rand(Float32, Nx, Ny, Nz);
xg = rand(Float32, Nx, Ny, Nz) |> cu;

In [42]:
# CPU baseline: broadcast `sin` over `xc`, allocating a new result array on every call.
# NOTE(review): `xc` is a global passed without `$` interpolation — BenchmarkTools
# recommends interpolating globals; confirm whether that matters at this problem size.
@benchmark sin.(xc)

BenchmarkTools.Trial: 
  memory estimate:  8.00 MiB
  allocs estimate:  4
  --------------
  minimum time:     14.950 ms (0.00% GC)
  median time:      15.167 ms (0.00% GC)
  mean time:        15.741 ms (3.74% GC)
  maximum time:     99.762 ms (83.66% GC)
  --------------
  samples:          318
  evals/sample:     1

In [44]:
# The same `sin` broadcast, applied to the `cu`-converted array `xg`.
# NOTE(review): there is no explicit device synchronization here. If the GPU broadcast
# returns before the kernel has finished, this times mostly the launch — confirm, and
# consider a synchronizing wrapper (e.g. `CuArrays.@sync`, if the installed version
# provides it) before comparing against the CPU timing.
@benchmark sin.(xg)

BenchmarkTools.Trial: 
  memory estimate:  1.64 KiB
  allocs estimate:  46
  --------------
  minimum time:     10.057 μs (0.00% GC)
  median time:      11.664 μs (0.00% GC)
  mean time:        61.537 μs (69.44% GC)
  maximum time:     102.846 ms (82.79% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [69]:
# Overwrite every element of `a` with its square root.
# Uses the generic `sqrt`, broadcast element-wise into the existing storage of `a`.
# Returns `nothing`; the result is the mutated argument.
function sqrt_cpu!(a)
    a .= sqrt.(a)
    return nothing
end

sqrt_cpu! (generic function with 1 method)

In [70]:
# Overwrite every element of `a` with its square root, using `CUDAnative.sqrt`
# instead of the generic `sqrt`. Returns `nothing`; the result is the mutated argument.
function sqrt_gpu!(a)
    a .= CUDAnative.sqrt.(a)
    return nothing
end

sqrt_gpu! (generic function with 1 method)

In [71]:
# Fresh inputs shifted by one, so every element lies in [1, 2).
xc = rand(Float32, Nx, Ny, Nz) .+ 1;
xg = cu(rand(Float32, Nx, Ny, Nz)) .+ 1;

In [74]:
# Time the in-place CPU square root. `sqrt_cpu!` overwrites `xc`, so every evaluation
# after the first operates on the previous evaluation's output, not the original data.
@benchmark sqrt_cpu!(xc)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.586 ms (0.00% GC)
  median time:      2.604 ms (0.00% GC)
  mean time:        2.607 ms (0.00% GC)
  maximum time:     3.379 ms (0.00% GC)
  --------------
  samples:          1916
  evals/sample:     1

In [75]:
# Run the `sqrt_cpu!` definition (generic `sqrt`, not `CUDAnative.sqrt`) on the
# `cu`-converted array `xg`.
# NOTE(review): no explicit device synchronization — this may be timing mostly the
# kernel launch rather than its execution; confirm before comparing with CPU timings.
@benchmark sqrt_cpu!(xg)

BenchmarkTools.Trial: 
  memory estimate:  1.14 KiB
  allocs estimate:  22
  --------------
  minimum time:     5.741 μs (0.00% GC)
  median time:      35.424 μs (0.00% GC)
  mean time:        34.681 μs (0.41% GC)
  maximum time:     733.781 μs (96.30% GC)
  --------------
  samples:          10000
  evals/sample:     5

In [76]:
# Run the `CUDAnative.sqrt` variant on the `cu`-converted array `xg`.
# NOTE(review): no explicit device synchronization — this may be timing mostly the
# kernel launch rather than its execution; confirm before comparing with CPU timings.
@benchmark sqrt_gpu!(xg)

BenchmarkTools.Trial: 
  memory estimate:  1.14 KiB
  allocs estimate:  22
  --------------
  minimum time:     5.643 μs (0.00% GC)
  median time:      35.437 μs (0.00% GC)
  mean time:        34.678 μs (0.37% GC)
  maximum time:     716.405 μs (96.22% GC)
  --------------
  samples:          10000
  evals/sample:     5

In [91]:
# Overwrite each element x of `a` with exp(sin(sqrt(x)^x)^cos(x))^(-x).
# The whole expression is a single fused broadcast written back into `a`.
# Returns `nothing`; the result is the mutated argument.
function expensive!(a)
    a .= exp.(sin.(sqrt.(a) .^ a) .^ cos.(a)) .^ (.-a)
    return nothing
end

expensive! (generic function with 1 method)

In [6]:
# Same formula as `expensive!`, exp(sin(sqrt(x)^x)^cos(x))^(-x), with every power
# taken through `CUDAnative.pow` instead of `^`.
# Returns `nothing`; the result is the mutated argument.
function expensive_gpu!(a)
    a .= CUDAnative.pow.(
        exp.(CUDAnative.pow.(sin.(CUDAnative.pow.(sqrt.(a), a)), cos.(a))),
        .-a,
    )
    return nothing
end

expensive_gpu! (generic function with 1 method)

In [85]:
# Fresh uniform [0, 1) Float32 inputs: a plain Array and a `cu`-converted copy of
# separately drawn data.
xc = rand(Float32, (Nx, Ny, Nz));
xg = cu(rand(Float32, (Nx, Ny, Nz)));

In [104]:
# Time the in-place CPU kernel. `expensive!` overwrites `xc`, so every evaluation after
# the first operates on already-transformed data rather than the original random input.
@benchmark expensive!(xc)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     480.248 ms (0.00% GC)
  median time:      481.230 ms (0.00% GC)
  mean time:        482.565 ms (0.00% GC)
  maximum time:     493.081 ms (0.00% GC)
  --------------
  samples:          11
  evals/sample:     1

In [7]:
# Time the GPU kernel; `$xg` interpolates the global array into the benchmark expression.
# NOTE(review): no explicit device synchronization — this may be timing mostly the
# kernel launch rather than its execution; confirm before using it for a speedup figure.
@benchmark expensive_gpu!($xg)

BenchmarkTools.Trial: 
  memory estimate:  5.19 KiB
  allocs estimate:  56
  --------------
  minimum time:     7.729 μs (0.00% GC)
  median time:      113.353 μs (0.00% GC)
  mean time:        109.462 μs (2.91% GC)
  maximum time:     23.472 ms (99.86% GC)
  --------------
  samples:          10000
  evals/sample:     3

In [1]:
# Ratio of the CPU median (~481 ms) to the GPU median (~113 μs) recorded above.
# NOTE(review): this is a real speedup only if the GPU timing includes kernel
# execution, i.e. the benchmark synchronized with the device — confirm.
481e-3 / 113e-6

4256.637168141593

In [42]:
# Overwrite each element x of `a` with (exp(sin(sqrt(x)^x)^cos(x))^(-x))^(-π).
# Returns `nothing`; the result is the mutated argument.
# NOTE(review): `^` on real floats throws a DomainError for a negative base with a
# non-integer exponent, so inputs that make an intermediate negative will error.
# NOTE(review): `super_expensive_gpu!` in this file evaluates a different, longer
# expression, so the two are not a like-for-like comparison.
function super_expensive!(a)
    a .= (exp.(sin.(sqrt.(a) .^ a) .^ cos.(a)) .^ (.-a)) .^ (.-π)
    return nothing
end

super_expensive! (generic function with 1 method)

In [16]:
# Overwrite each element x of `a` with
#     exp(sqrt(sqrt(abs(cos(sin(p)))))),
# where p = pow(pow(pow(exp(pow(sin(pow(sqrt(x), x)), cos(x))), -x), π), -π)
# and every `pow` is `CUDAnative.pow`.
# Returns `nothing`; the result is the mutated argument.
# NOTE(review): this is not the same expression as `super_expensive!` (extra `pow(·, π)`
# and the outer exp/sqrt/sqrt/abs/cos/sin chain) — confirm which one is intended.
function super_expensive_gpu!(a)
    a .= exp.(sqrt.(sqrt.(abs.(cos.(sin.(
        CUDAnative.pow.(
            CUDAnative.pow.(
                CUDAnative.pow.(
                    exp.(CUDAnative.pow.(sin.(CUDAnative.pow.(sqrt.(a), a)), cos.(a))),
                    .-a,
                ),
                π,
            ),
            .-π,
        ),
    ))))))
    return nothing
end

super_expensive_gpu! (generic function with 1 method)

In [43]:
# Fresh uniform [0, 1) Float32 inputs for the `super_expensive` kernels.
xc = rand(Float32, (Nx, Ny, Nz));
xg = rand(Float32, (Nx, Ny, Nz)) |> cu;

In [44]:
# Time the CPU version.
# NOTE(review): the recorded run below ended in a DomainError instead of timings.
# `super_expensive!` overwrites `xc`, so repeated evaluations feed its own output back
# in; presumably a negative base reaches a non-integer power on some pass — confirm
# which `^` fails and whether the cell was run against the definition shown above.
@benchmark super_expensive!(xc)

DomainError: DomainError with -3.141592653589793:
Exponentiation yielding a complex result requires a complex argument.
Replace x^y with (x+0im)^y, Complex(x)^y, or similar.