# Bechmarking Custom Kernels

## Setup

In [1]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

In [2]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [3]:
Threads.nthreads()

8

In [5]:
CUDA.memory_status()

Effective GPU memory usage: 1.10% (89.500 MiB/7.936 GiB)
No memory pool is in use.

---

## Initialize Data

In [3]:
function set_thread_block(len, nthreads=256)
    nblocks = ceil(Int, len / nthreads)
    return nthreads, nblocks
end

set_thread_block (generic function with 2 methods)

In [4]:
N = 2^15                    # num. samples
T, B = set_thread_block(N)
nx, ny = 12, 8              # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

---

## Kernels

### Mean & Variance

In [6]:
function mean!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= ncol
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B mean!(x, size(x)[2], out)
@test isapprox(out, mean(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, mean(x, dims = 2))
   Evaluated: isapprox([0.4672852542253805, 0.551041753452703, 0.5985412991637825, 0.4226288275419107, 0.5317097044519004, 0.4342811713741532, 0.6866815818336852, 0.4392179624362464, 0.34722126058832464, 0.5840441773507449  …  0.3457239181502297, 0.5337717200348154, 0.38929762460815415, 0.5436974949083385, 0.4109756444706507, 0.3974622649104835, 0.6568111480839699, 0.5282659231047768, 0.5078012153777888, 0.4573249937726236], [0.4672852542253805; 0.551041753452703; … ; 0.5078012153777888; 0.4573249937726236;;])

In [7]:
function sumsq!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B sumsq!(x, size(x)[2], out)
@test isapprox(out, sum(x.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, sum(x .^ 2, dims = 2))
   Evaluated: isapprox([3.693223999791831, 4.978098740272386, 5.285090633465086, 3.440420883931469, 4.089495444177012, 3.2780957378038167, 6.466351815623792, 2.881706696619888, 2.5407194751662336, 4.561802191965835  …  2.331504998496828, 4.152356548441717, 2.8892253011640694, 4.535481878976794, 2.8936879257120185, 2.931741671330754, 5.814172274833773, 4.1577661411085405, 3.9571249475068866, 3.6563412779424125], [3.6932239997918304; 4.978098740272386; … ; 3.9571249475068866; 3.6563412779424125;;])

In [8]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

function var_gpu(x)
    nrow, ncol = size(x)
    T, B = set_thread_block(nrow)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B _var!(ncol, ss, means, vars)
    return vars, means
end

z = CUDA.rand(Float64, 300, 20)  # works with arbitrary matrix size
@test isapprox(var_gpu(z)[1], var(z, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox((var_gpu(z))[1], var(z, dims = 2))
   Evaluated: isapprox([0.08429197235765579, 0.0744182583300651, 0.08567219715305538, 0.05947543373856663, 0.11287290751278621, 0.06848336639028041, 0.0880558708589525, 0.09886973090462484, 0.09423904555083636, 0.0861626352703772  …  0.06380125280728965, 0.06363696274728829, 0.07324037579453938, 0.06096120654117144, 0.06636931532131247, 0.10343917966082875, 0.09541847889025085, 0.0774501037223696, 0.09305758436870598, 0.06853871531191043], [0.0842919723576558; 0.07441825833006517; … ; 0.09305758436870609; 0.06853871531191041;;])

### t Test Statistic

In [11]:
function mul_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= val
    return
end

function mul_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= y[i]
    return
end

function div_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= val
    return
end

function div_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= y[i]
    return
end

function add_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] += val
    return
end

function add_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] += y[i]
    return
end

function sub_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] -= val
    return
end

function sub_arr_gpu!(x, y)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] -= y[i]
    return
end

function sqrt_gpu!(x)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] = sqrt(x[i])
    return
end

function copy_arr_gpu!(dest, source)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds dest[i] = source[i]
    return
end

copy_arr_gpu! (generic function with 1 method)

In [12]:
function t_gpu(x, y, pooled)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x, 2), size(y, 2)
    T, B = set_thread_block(size(x,1))
    
    if pooled
        @cuda threads=T blocks=B mul_gpu!(varx, nx-1)
        @cuda threads=T blocks=B mul_gpu!(vary, ny-1)
        @cuda threads=T blocks=B add_arr_gpu!(varx, vary)     # varx = (nx-1)*varx + (ny-1)*vary
        @cuda threads=T blocks=B div_gpu!(varx, nx+ny-2)      # varx /= nx+ny-2
        #@cuda threads=256 blocks=4 copy_arr_gpu!(vary, varx)
        copyto!(vary, varx)
    end
        
    @cuda threads=T blocks=B div_gpu!(varx, nx)
    @cuda threads=T blocks=B div_gpu!(vary, ny)
    @cuda threads=T blocks=B add_arr_gpu!(varx, vary)  # varx .+= vary
    @cuda threads=T blocks=B sqrt_gpu!(varx)
    #return varx
    @cuda threads=T blocks=B sub_arr_gpu!(meanx, meany)
    #return meanx
    @cuda threads=T blocks=B div_arr_gpu!(meanx, varx)
    return meanx
end

a = CUDA.rand(Float64, 200, 200)
b = CUDA.rand(Float64, 200, 200)
pooled = false
@test isapprox(
    t_gpu(a, b, pooled),
    ttest_ind(a, b, pooled)
)

[91m[1mError During Test[22m[39m at [39m[1mIn[12]:30[22m
  Test threw exception
  Expression: isapprox(t_gpu(a, b, pooled), ttest_ind(a, b, pooled))
  MethodError: no method matching div_arr_gpu!(::CuDeviceVector{Float64, 1}, ::CuDeviceVector{Float64, 1})
  [0mClosest candidates are:
  [0m  div_arr_gpu!(::Any, ::Any) at In[11]:22
  [0m  div_arr_gpu!(::Any, ::Any, [91m::Any[39m) at In[9]:22
  Stacktrace:
    [1] [0m[1mmacro expansion[22m
  [90m    @ [39m[90m~/.julia/packages/GPUCompiler/HeCT6/src/[39m[90m[4mcache.jl:0[24m[39m[90m [inlined][39m
    [2] [0m[1mspecialization_id[22m[0m[1m([22m[90mjob[39m::[0mGPUCompiler.CompilerJob[90m{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{typeof(div_arr_gpu!), Tuple{CuDeviceVector{Float64, 1}, CuDeviceVector{Float64, 1}}}}[39m[0m[1m)[22m
  [90m    @ [39m[35mGPUCompiler[39m [90m~/.julia/packages/GPUCompiler/HeCT6/src/[39m[90m[4mcache.jl:12[24m[39m
    [3] [0m[1mcac

LoadError: [91mThere was an error during testing[39m

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0