# Bechmarking Custom Kernels

## Setup

In [16]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

In [9]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [3]:
Threads.nthreads()

8

In [17]:
CUDA.memory_status()

Effective GPU memory usage: 1.89% (153.625 MiB/7.936 GiB)
No memory pool is in use.

---

## Initialize Data

In [18]:
function set_thread_block(len, nthreads=256)
    nblocks = ceil(Int, len / nthreads)
    return nthreads, nblocks
end

set_thread_block (generic function with 2 methods)

In [19]:
N = 2^15                    # num. samples
T, B = set_thread_block(N)
nx, ny = 12, 8              # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

---

## Kernels

### Mean & Variance

In [24]:
function mean!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= ncol
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B mean!(x, size(x)[2], out)
@test isapprox(out, mean(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, mean(x, dims = 2))
   Evaluated: isapprox([0.35853716622161697, 0.3091693259738932, 0.689691935252548, 0.47212022013793603, 0.41096342341292136, 0.6515839175455919, 0.42196401231345776, 0.5022668870015077, 0.600749682796924, 0.5165848241247455  …  0.5753852698107481, 0.4772818330704583, 0.5316436073848941, 0.6056038081204794, 0.4095856354200143, 0.39356171787676003, 0.6022315024473835, 0.3286615267603409, 0.4807587984501902, 0.5358573367271325], [0.35853716622161697; 0.3091693259738932; … ; 0.4807587984501902; 0.5358573367271325;;])

In [23]:
function sumsq!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B sumsq!(x, size(x)[2], out)
@test isapprox(out, sum(x.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, sum(x .^ 2, dims = 2))
   Evaluated: isapprox([2.4598905191141793, 2.1331097392573004, 6.457076804953674, 4.271737362977878, 2.8739343099876766, 5.96193317235129, 3.0131836235342, 4.11507195311366, 5.557911735957049, 4.77081110941095  …  4.719813463028387, 3.2924660221333895, 4.544027700111192, 5.052011870121489, 2.697714680372116, 2.7336969619443443, 5.288301260517309, 1.991310374632858, 3.5751915973356128, 4.667735857992992], [2.459890519114179; 2.1331097392573004; … ; 3.5751915973356128; 4.667735857992992;;])

In [26]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

function var_gpu(x)
    nrow, ncol = size(x)
    T, B = set_thread_block(nrow)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B _var!(ncol, ss, means, vars)
    return vars, means
end

z = CUDA.rand(Float64, 300, 20)  # works with arbitrary matrix (i.e., dimensions don't have to be multiple of 256)
@test isapprox(var_gpu(z)[1], var(z, dims=2))

var_gpu (generic function with 1 method)

### t Test Statistic

In [30]:
function mul_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= val
    return
end

function mul_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= y[i]
    return
end

function div_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= val
    return
end

function div_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= y[i]
    return
end

function sqrt_gpu!(x)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] = sqrt(x[i])
    return
end

function add_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
end

function subtract_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] - b[i]
    return
end

function copy_arr_gpu!(dest, source)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds dest[i] = source[i]
    return
end

copy_arr_gpu! (generic function with 1 method)

In [32]:
function t_gpu(x, y, pooled)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x, 2), size(y, 2)
    T, B = set_thread_block(size(x,1))
    
    if pooled
        @cuda threads=T blocks=B mul_gpu!(varx, nx-1)
        @cuda threads=T blocks=B mul_gpu!(vary, ny-1)
        @cuda threads=T blocks=B add_gpu!(varx, vary, varx)   # varx = (nx-1)*varx + (ny-1)*vary
        @cuda threads=T blocks=B div_gpu!(varx, nx+ny-2)      # varx /= nx+ny-2
        #@cuda threads=256 blocks=4 copy_arr_gpu!(vary, varx)
        copyto!(vary, varx)
    end
        
    @cuda threads=T blocks=B div_gpu!(varx, nx)
    @cuda threads=T blocks=B div_gpu!(vary, ny)
    
    denom = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B add_gpu!(varx, vary, denom)
    @cuda threads=T blocks=B sqrt_gpu!(denom)
    
    t = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B subtract_gpu!(meanx, meany, t)
    @cuda threads=T blocks=B div_arr_gpu!(t, denom)
    return t
end

t_gpu (generic function with 2 methods)

In [35]:
pooled = false
start, stop = 1, 10
a, b = x[start:stop, :], y[start:stop, :]
@test isapprox(
    t_gpu(a, b, pooled),
    ttest_ind(a, b, pooled)
)

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x[1:10, :], y[1:10, :], pooled), ttest_ind(x[1:10, :], y[1:10, :], pooled))
   Evaluated: isapprox([-0.6084672400601868, -1.3656430152327157, 1.3233596834837367, -0.808655969482445, -0.919506217068944, 0.48850609480561896, 0.22310357447735862, 0.35679933928850255, 1.2260926596041675, 0.8993202025114901], [-0.6084672400601863; -1.3656430152327155; … ; 1.2260926596041692; 0.8993202025114894;;])

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0