# Bechmarking Custom Kernels

## Setup

In [1]:
using CUDA, Random
using Test, BenchmarkTools

In [2]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [3]:
Threads.nthreads()

8

In [52]:
CUDA.memory_status()

Effective GPU memory usage: 39.93% (3.169 GiB/7.936 GiB)
No memory pool is in use.

---

## Initialize Data

In [9]:
N = 2^15                       # num. samples
T = 256                        # num. threads
B = ceil(Int64, N / nthreads)  # num. blocks
nx, ny = 12, 8                 # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)

([0.39015901569917794 0.7847194345068826 … 0.20928740672719975 0.9220333236950535; 0.16428808275350754 0.7056426236607227 … 0.6910468494711057 0.11410357680405231; … ; 0.6185908187952656 0.8206356303474067 … 0.4199970199504272 0.7386790938706924; 0.47547065097259417 0.26788680501761214 … 0.3755500819595143 0.6001599265404269], [0.5027007481726635 0.020998168712858367 … 0.00425394473947821 0.9529314091650041; 0.2140818046135799 0.1042998694251655 … 0.4045272179679979 0.8507021117824134; … ; 0.07906185207522681 0.6921827628559143 … 0.958950005841269 0.510196219231605; 0.695025035001501 0.08335582501523614 … 0.0004320606559524154 0.6036066795632946])

In [10]:
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [40]:
CUDA.allowscalar(false)
x[1]   # scalar indexing

LoadError: Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.

In [41]:
x[px]  # matrix (non-scalar) indexing

125970×12 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.390159  0.164288   0.048688   0.277681  …  0.0676202  0.253002   0.762648
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.17261
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.742003
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.118479
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.385812
 0.390159  0.164288   0.048688   0.277681  …  0.0676202  0.253002   0.476971
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.766175
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.0718647
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.253002   0.904703
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.762648   0.17261
 0.390159  0.164288   0.048688   0.277681  …  0.0676202  0.762648   0.742003
 0.390159  0.164288   0.048688   0.277681     0.0676202  0.762648   0.118479
 0.390159  0.164288   0

---

## Kernels

### Mean & Variance

In [8]:
function mean!(x, n, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:n
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= n
    return
end

mean! (generic function with 1 method)

In [14]:
out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B mean!(x, size(x)[2], out)
@test isapprox(out, mean(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, mean(x, dims = 2))
   Evaluated: isapprox([0.42276973218318964, 0.42767573385618834, 0.4831524712592485, 0.48611237979911487, 0.4395516192435302, 0.49092161596331424, 0.4851448484475114, 0.5351914380198286, 0.3874865059055883, 0.44487627001643726  …  0.43060517340148524, 0.34602936197057826, 0.6835004180168199, 0.49907858337740735, 0.499260425583421, 0.6306563601147931, 0.49835490460993603, 0.4185987769980128, 0.5069556192809072, 0.4936965027742839], [0.42276973218318964; 0.42767573385618834; … ; 0.5069556192809072; 0.4936965027742839;;])

In [15]:
function sumsq!(x, n, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:n
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

sumsq! (generic function with 1 method)

In [16]:
out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B sumsq!(x, size(x)[2], out)
@test isapprox(out, sum(x.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, sum(x .^ 2, dims = 2))
   Evaluated: isapprox([2.9531018774148086, 3.1790675748369743, 3.9656508444115333, 3.800037022907106, 3.356273550543609, 4.16561891875466, 4.0703096558270575, 4.111705031998799, 2.866273597668395, 3.415905663621952  …  3.142739218575019, 2.6207189658821237, 6.347311199904543, 3.6258463421240386, 3.8745421141461165, 5.555892979579336, 4.239972555346566, 2.9453676018286834, 3.572019126119588, 3.5529036727134033], [2.9531018774148086; 3.1790675748369743; … ; 3.5720191261195877; 3.552903672713403;;])

In [20]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

function var_gpu(x)
    nrow, ncol = size(x)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B _var!(ncol, ss, means, vars)
    return vars, means
end

var_gpu (generic function with 1 method)

In [21]:
@test isapprox(var_gpu(x)[1], var(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox((var_gpu(x))[1], var(x, dims = 2))
   Evaluated: isapprox([0.07348099272835072, 0.08947174317125667, 0.1058559198731369, 0.0876703703072363, 0.0943460035262088, 0.11577913841038899, 0.11326576073865019, 0.06132241164018315, 0.09677491732375788, 0.09462971964882692  …  0.08342631220486738, 0.10762573943026708, 0.06738521297768471, 0.0578993775901856, 0.08031004031829571, 0.07119669499458733, 0.11451647490556587, 0.07660621532526354, 0.04436101155215741, 0.0570971664086092], [0.07348099272835071; 0.08947174317125672; … ; 0.04436101155215735; 0.05709716640860912;;])

### t Test Statistic

In [22]:
function mul_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= val
    return
end

function mul_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= y[i]
    return
end

function div_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= val
    return
end

function div_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= y[i]
    return
end

function sqrt_gpu!(x)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] = sqrt(x[i])
    return
end

function add_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
end

function subtract_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] - b[i]
    return
end

function copy_arr_gpu!(dest, source)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds dest[i] = source[i]
    return
end

copy_arr_gpu! (generic function with 1 method)

In [26]:
function t_gpu(x, y; pooled=false)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x)[2], size(y)[2]
    
    if pooled
        @cuda threads=T blocks=B mul_gpu!(varx, nx-1)
        @cuda threads=T blocks=B mul_gpu!(vary, ny-1)
        @cuda threads=T blocks=B add_gpu!(varx, vary, varx)   # varx = (nx-1)*varx + (ny-1)*vary
        @cuda threads=T blocks=B div_gpu!(varx, nx+ny-2)      # varx /= nx+ny-2
        #@cuda threads=256 blocks=4 copy_arr_gpu!(vary, varx)
        copyto!(vary, varx)
    end
        
    @cuda threads=T blocks=B div_gpu!(varx, nx)
    @cuda threads=T blocks=B div_gpu!(vary, ny)
    
    denom = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B add_gpu!(varx, vary, denom)
    @cuda threads=T blocks=B sqrt_gpu!(denom)
    
    t = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B subtract_gpu!(meanx, meany, t)
    @cuda threads=T blocks=B div_arr_gpu!(t, denom)
    return t
end

t_gpu (generic function with 1 method)

In [27]:
pooled = false
@test isapprox(t_gpu(x, y, pooled=pooled), ttest_ind(x, y, pooled))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x, y, pooled = pooled), ttest_ind(x, y, pooled))
   Evaluated: isapprox([-0.23448901664480554, 0.05901404020274747, 0.38193147235097874, -1.1025761619685288, -0.37842271181044485, -0.5640191022990872, 0.9204858513562348, -0.5540337831883215, -2.022682039847498, -0.30371442721737474  …  -0.7057823538090785, -0.6766744521720237, 2.142521777498816, -1.7249230335734647, 1.084810557658178, 1.222268956858456, -0.5442251380071487, 0.11810933367877136, 0.5399342464957391, 1.2943106132842188], [-0.23448901664480554; 0.05901404020274746; … ; 0.5399342464957391; 1.2943106132842193;;])

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0