# Bechmarking Custom Kernels

## Setup

In [1]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

In [41]:
function subtract!(x, val)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x 
    @inbounds x[i] -= val
    return
end

a = CUDA.ones(Float64, 256*4)
target = a .- 1
@cuda threads=256 blocks=4 subtract!(a, 1)
all(target .== a)

true

In [15]:
function add_gpu!(y, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    y[i] += x[i]
    return
end

a = CUDA.rand(Float64, 256*4)
b = CUDA.rand(Float64, 256*4)
c = a .+ b
@cuda threads=256 blocks=4 add_gpu!(b, a)
all(b .== c)

true

In [2]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [3]:
Threads.nthreads()

8

In [4]:
CUDA.memory_status()

Effective GPU memory usage: 0.73% (59.125 MiB/7.936 GiB)
No memory pool is in use.

---

## Initialize Data

In [4]:
function set_thread_block(len, nthreads=256)
    nblocks = ceil(Int, len / nthreads)
    return nthreads, nblocks
end

set_thread_block (generic function with 2 methods)

In [5]:
N = 2^15                    # num. samples
T, B = set_thread_block(N)
nx, ny = 12, 8              # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

---

## Kernels

### Mean & Variance

In [7]:
function mean!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= ncol
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B mean!(x, size(x)[2], out)
@test isapprox(out, mean(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, mean(x, dims = 2))
   Evaluated: isapprox([0.39733968302108463, 0.46606242490765926, 0.509690131471409, 0.38030606362527636, 0.6635388999644857, 0.5335489722052404, 0.6207313270377118, 0.5519816092430713, 0.4478335769723505, 0.5470873382976155  …  0.48842900703929043, 0.5114141271222917, 0.5877468897883708, 0.42488719483401827, 0.5967271710540742, 0.448077987777508, 0.5754315354221882, 0.44142439703720476, 0.5227641299077471, 0.4134743813888784], [0.39733968302108463; 0.46606242490765926; … ; 0.5227641299077471; 0.4134743813888784;;])

In [8]:
function sumsq!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B sumsq!(x, size(x)[2], out)
@test isapprox(out, sum(x.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, sum(x .^ 2, dims = 2))
   Evaluated: isapprox([3.3505186320850746, 3.5205090438719506, 4.599308140690546, 2.5799348098844064, 6.183259698236766, 4.028304261879531, 5.590323020933638, 4.48616806528492, 3.3989079529358355, 4.538556192110131  …  3.8212513017881085, 3.983841694657771, 4.934117957318372, 3.298106200977595, 5.387028066307541, 3.3676077919729015, 4.902036830006772, 3.254992949429894, 4.460643440418318, 3.0730376887687623], [3.3505186320850746; 3.5205090438719506; … ; 4.460643440418318; 3.0730376887687623;;])

In [9]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

function var_gpu(x)
    nrow, ncol = size(x)
    T, B = set_thread_block(nrow)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B _var!(ncol, ss, means, vars)
    return vars, means
end

z = CUDA.rand(Float64, 300, 20)  # works with arbitrary matrix size
@test isapprox(var_gpu(z)[1], var(z, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox((var_gpu(z))[1], var(z, dims = 2))
   Evaluated: isapprox([0.0990023905050552, 0.08724602566531708, 0.09236236634725219, 0.07680292526336481, 0.11297964265478316, 0.09154778136352783, 0.07007101500217339, 0.10159567902516423, 0.08830723936878838, 0.1047530154445524  …  0.07780201021880724, 0.0831995443758919, 0.08626285371574997, 0.08978033242054828, 0.09929088849489377, 0.07658279287466459, 0.05971251101062614, 0.08081223981841738, 0.08140016915085069, 0.09124581895573898], [0.0990023905050551; 0.08724602566531728; … ; 0.08140016915085076; 0.09124581895573891;;])

### t Test Statistic

In [6]:
function mul_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= val
    return
end

function mul_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= y[i]
    return
end

function div_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= val
    return
end

function div_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= y[i]
    return
end

function add_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] += val
    return
end

function add_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] += y[i]
    return
end

function sub_gpu!(x, val)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] -= val
    return
end

function sub_arr_gpu!(x, y)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] -= y[i]
    return
end

function sqrt_gpu!(x)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] = sqrt(x[i])
    return
end

function copy_arr_gpu!(dest, source)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds dest[i] = source[i]
    return
end

copy_arr_gpu! (generic function with 1 method)

In [12]:
CUDA.rand(1)

1-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 0.42432487

In [31]:
Random.seed!(123)
a = CuArray(rand(N))
b = copy(a)
val = CUDA.rand(1)
T, B = set_thread_block(size(a,1))
@cuda threads=T blocks=B sub_gpu!(a, val)
isapprox(a, b .- val)
#@cuda threads=T blocks=B div_gpu!(a, val)
#isapprox(a, (b .- val) ./ val)

LoadError: GPU compilation of kernel sub_gpu!(CuDeviceVector{Float64, 1}, CuDeviceVector{Float32, 1}) failed
KernelError: kernel returns a value of type `Union{}`

Make sure your kernel function ends in `return`, `return nothing` or `nothing`.
If the returned value is of type `Union{}`, your Julia code probably throws an exception.
Inspect the code with `@device_code_warntype` for more details.


In [115]:
function t_gpu(x, y, pooled)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x, 2), size(y, 2)
    T, B = set_thread_block(size(x,1))
    return varx
end

t_gpu (generic function with 1 method)

In [119]:
all(var_gpu(x[1,:]')[1] .== var(x[1,:]))

false

In [105]:
pooled = false
@test isapprox(
    t_gpu(x, y, pooled),
    ttest_ind(x, y, pooled)
)

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x, y, pooled), ttest_ind(x, y, pooled))
   Evaluated: isapprox([-0.17846062188608072, -0.5544929233801428, -0.053442709686935326, -1.6453844463179053, 0.7985744065403201, 0.6613679526260251, 1.4077764704394224, 0.9316901817580137, 0.976555746673132, -1.0320007794048787  …  -0.1742915694431432, 0.10326542932082894, 1.1239376782276302, -0.7318112449317216, 1.1491076499914104, -0.20567239115503766, 0.17747838757216614, -1.0966573125795651, -0.19199019010346396, -1.497745664028119], [-0.17846062188608072; -0.5544929233801424; … ; -0.19199019010346394; -1.497745664028119;;])

In [107]:
@views t_gpu(x[1:2,:], y[1:2,:], false)

2-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 -2.7438546778295443
 -4.170154062102384

### Permutation test p-value

In [17]:
x[1,:]'

1×12 adjoint(::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}) with eltype Float64:
 0.758347  0.156238  0.00663024  0.761705  …  0.126937  0.0843308  0.776577

In [38]:
ttest_ind(x[1,:], y[1,:], pooled)

1-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 -0.1676415669614003

In [42]:
t_gpu(x[1,:]', y[1,:]', false)

1-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 NaN

In [31]:
function pval_gpu(x, y, px, py; pooled=false, alternative="two-sided", delta=0)
    x_shift = copy(x)
    T, B = set_thread_block(size(x,1))
    @cuda threads=T blocks=B sub_gpu!(x_shift, delta)
    t_obs = t_gpu(x_shift', y', pooled)
    return t_obs
end

pval_gpu(x[1,:], y[1,:], px, py)

1-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 NaN

In [29]:
x[1,:]'

1×12 adjoint(::CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}) with eltype Float64:
 0.758347  0.156238  0.00663024  0.761705  …  0.126937  0.0843308  0.776577

In [None]:
    x1_shifted = x1 .- delta             # shift group 1 under null hypothesis
    t_obs = ttest_ind(x1_shifted, x2, pooled)  # test statistic for observed data
    # println(t_obs)

    combined = vcat(x1_shifted, x2)  # join original pair into single vector
    x1s = combined[parts1]   # get all combinations of pairs from original pair
    x2s = combined[parts2]
    ts = ttest_ind(x1s, x2s, pooled)   # test statistic for all possible pairs of samples

    if alternative == "smaller"
        n_extreme = count(ts .<= t_obs)
    elseif alternative == "larger"
        n_extreme = count(ts .>= t_obs)
    else
        n_extreme = count(@. (ts <= -abs(t_obs)) | (ts >= abs(t_obs)))
    end

    return n_extreme / size(parts1)[1]  # proportion of pairs w/ extreme test statistic

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0