# Custom Kernels

## Setup

In [1]:
using CUDA, Random
using Test, BenchmarkTools
CUDA.allowscalar(false)

In [17]:
include("perm_test.jl")
include("partition.jl")
include("kernels/math.jl")
include("kernels/statistics.jl")



Main.PermTestFast

In [3]:
Threads.nthreads()

8

In [3]:
CUDA.memory_status()

Effective GPU memory usage: 0.73% (59.125 MiB/7.936 GiB)
No memory pool is in use.

---

## Initialize Data

In [3]:
function set_thread_block(len, nthreads=256)
    nblocks = ceil(Int, len / nthreads)
    return nthreads, nblocks
end

set_thread_block (generic function with 2 methods)

In [None]:
N = 2^15                    # num. samples
T, B = set_thread_block(N)
nx, ny = 12, 8              # sample sizes for each group
x, y = CUDA.rand(Float64, N, nx), CUDA.rand(Float64, N, ny)
px, py = partition(nx, ny)
px, py = CuArray(px), CuArray(py)

---

## Kernels

### Mean & Variance

In [16]:
function mean!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= ncol
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B mean!(x, size(x)[2], out)
@test isapprox(out, mean(x, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, mean(x, dims = 2))
   Evaluated: isapprox([0.39903970495482377, 0.6069146944620198, 0.41372937634873236, 0.5681037409491377, 0.6186572467013405, 0.4411451109421142, 0.5648831272101186, 0.47154206050713604, 0.37457577917132684, 0.5092191357925955  …  0.5021919976200732, 0.38228426267569066, 0.4512002267043255, 0.6576276406376775, 0.6359889766382011, 0.5720421900507177, 0.591892831811546, 0.4266947053424997, 0.22019978800251103, 0.6031515087441858], [0.39903970495482377; 0.6069146944620198; … ; 0.22019978800251103; 0.6031515087441858;;])

In [17]:
function sumsq!(x, ncol, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:ncol
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

out = CUDA.zeros(Float64, N)
@cuda threads=T blocks=B sumsq!(x, size(x)[2], out)
@test isapprox(out, sum(x.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(out, sum(x .^ 2, dims = 2))
   Evaluated: isapprox([3.0239137401401934, 5.38797083034672, 3.0044107687385257, 4.616127720145396, 5.5713699407159165, 3.8169105546002835, 4.60812318334853, 3.743956590830854, 2.051511271512022, 4.0939642255907325  …  4.1386578720074665, 2.8246693036900554, 3.770801456226647, 6.056627665647238, 5.662241973549224, 4.667150421992148, 5.293743629207229, 3.1609596708238903, 1.1049308466357568, 5.296105671737127], [3.0239137401401934; 5.38797083034672; … ; 1.104930846635757; 5.296105671737127;;])

In [18]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

function var_gpu(x)
    nrow, ncol = size(x)
    T, B = set_thread_block(nrow)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=T blocks=B sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=T blocks=B _var!(ncol, ss, means, vars)
    return vars, means
end

z = CUDA.rand(Float64, 300, 20)  # works with arbitrary matrix size
@test isapprox(var_gpu(z)[1], var(z, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox((var_gpu(z))[1], var(z, dims = 2))
   Evaluated: isapprox([0.10597717250470276, 0.10797098402035558, 0.11729883932098417, 0.07096396287729326, 0.06314298723519225, 0.08494711449979803, 0.09847793582518864, 0.09699750377609896, 0.07571370942514459, 0.10275349677446104  …  0.08015847853500102, 0.08998300752288253, 0.06519248962754598, 0.08073784142811168, 0.084488973037789, 0.06402254893840029, 0.070524029914996, 0.04532933769175287, 0.09791870235931216, 0.0890230268805578], [0.1059771725047027; 0.1079709840203555; … ; 0.09791870235931223; 0.08902302688055781;;])

### t Test Statistic

In [33]:
function t_gpu(x, y, pooled=false)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x, 2), size(y, 2)
    T, B = set_thread_block(size(x,1))
    
    if pooled
        # TODO
    end
    
    @cuda threads=T blocks=B div!(varx, nx)          # varx ./= nx
    @cuda threads=T blocks=B div!(vary, ny)          # vary ./= ny
    @cuda threads=T blocks=B add_arr!(varx, vary)    # varx .+= vary
    @cuda threads=T blocks=B sqrt!(varx)             # varx = sqrt.(varx)
    @cuda threads=T blocks=B sub_arr!(meanx, meany)  # meanx .-= meany
    @cuda threads=T blocks=B div_arr!(meanx, varx)   # meanx ./= varx
    return meanx
end

t_gpu (generic function with 2 methods)

In [62]:
Random.seed!(123)
a = CUDA.rand(1)

1-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
 0.21063626

In [69]:
Random.seed!(123)
a = CuArray(rand(Float64, 100, 200))
b = CuArray(rand(Float64, 100, 200))
pooled = false
@test isapprox(
    t_gpu(a, b, pooled),
    ttest_ind(a, b, pooled)
)

[91m[1mTest Failed[22m[39m at [39m[1mIn[69]:5[22m
  Expression: isapprox(t_gpu(a, b, pooled), ttest_ind(a, b, pooled))
   Evaluated: isapprox([-0.23564354671874313, -0.7314884559922012, -0.6228620369323843, -0.6587417734497699, -1.37786466581233, -0.48673625287476424, 1.80195255483362, 0.8087199602797334, -3.0548390945889685, 2.4099816054025234  …  0.5748928006581614, 0.016612625978835775, 1.208724597157396, -0.4139906405648631, 0.6799404971542194, -1.3143179053056688, 0.7016588526679763, -2.8257266898253177, -0.5470782793543589, -1.3385727384807207], [-0.23258194451924272; -0.7210603008419425; … ; -0.5396824985932083; -1.3193429258769838;;])


LoadError: [91mThere was an error during testing[39m

### Permutation test p-value

In [31]:
function pval_gpu(x, y, px, py; pooled=false, alternative="two-sided", delta=0)
    x_shift = copy(x)
    T, B = set_thread_block(size(x,1))
    @cuda threads=T blocks=B sub_gpu!(x_shift, delta)
    t_obs = t_gpu(x_shift', y', pooled)
    return t_obs
end

pval_gpu(x[1,:], y[1,:], px, py)

1-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 NaN

In [None]:
    x1_shifted = x1 .- delta             # shift group 1 under null hypothesis
    t_obs = ttest_ind(x1_shifted, x2, pooled)  # test statistic for observed data
    # println(t_obs)

    combined = vcat(x1_shifted, x2)  # join original pair into single vector
    x1s = combined[parts1]   # get all combinations of pairs from original pair
    x2s = combined[parts2]
    ts = ttest_ind(x1s, x2s, pooled)   # test statistic for all possible pairs of samples

    if alternative == "smaller"
        n_extreme = count(ts .<= t_obs)
    elseif alternative == "larger"
        n_extreme = count(ts .>= t_obs)
    else
        n_extreme = count(@. (ts <= -abs(t_obs)) | (ts >= abs(t_obs)))
    end

    return n_extreme / size(parts1)[1]  # proportion of pairs w/ extreme test statistic

---

## Benchmarks

### t Test Statistic: `@cuda` vs. `@.` vectorized

In [51]:
@btime t_gpu($x, $y, pooled=$pooled)
@btime ttest_ind($x, $y, $pooled)

  248.894 μs (428 allocations: 22.92 KiB)
  706.026 μs (340 allocations: 19.33 KiB)


32768×1 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 -0.23448901664480554
  0.05901404020274746
  0.38193147235097874
 -1.1025761619685284
 -0.37842271181044496
 -0.5640191022990872
  0.9204858513562348
 -0.5540337831883215
 -2.022682039847497
 -0.30371442721737457
  1.5446552677841405
 -0.3557875158409345
 -1.23487851962008
  ⋮
 -0.41275974205144184
 -1.993862076306121
 -0.7057823538090785
 -0.6766744521720237
  2.142521777498816
 -1.7249230335734638
  1.0848105576581777
  1.2222689568584562
 -0.5442251380071487
  0.11810933367877134
  0.5399342464957391
  1.2943106132842193

### Copying

In [23]:
a = CUDA.rand(N)
b = CUDA.zeros(N)

@btime copyto!($b, $a)
@btime @cuda threads=T blocks=B copy_arr_gpu!(b, a)
@test all(a .== b)

  2.525 μs (0 allocations: 0 bytes)
  6.128 μs (24 allocations: 1.28 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(a .== b)

### Initializing `CuArray`s

In [48]:
@btime CuArray{Float64, 1}(undef, 10_000)
@btime CUDA.zeros(Float64, 10_000)
@btime CuArray(zeros(10_000))

  4.660 μs (4 allocations: 128 bytes)
  12.167 μs (22 allocations: 1.11 KiB)
  26.057 μs (7 allocations: 78.31 KiB)


10000-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 ⋮
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0