In [2]:
using CUDA, Random, Test
CUDA.allowscalar(false)

In [3]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [5]:
Random.seed!(123)
N = 1024
n1, n2 = 12, 8
x1, x2 = CUDA.rand(Float64, N, n1), CUDA.rand(Float64, N, n2)

([0.33191841660403193 0.13993236241073398 … 0.3266274977484425 0.07435057735085088; 0.09758743943216291 0.7245063305784085 … 0.4537553121217804 0.6003845859788253; … ; 0.8944203199277037 0.3779703845290438 … 0.45657134590782905 0.9976296889782996; 0.5759949577993901 0.7966846977630275 … 0.3319420113131965 0.15994894964580547], [0.4543866769965745 0.2636475562764177 … 0.03507011170581459 0.35581133995783737; 0.7200073576520345 0.8690257846587579 … 0.10709625780543425 0.19803005445043637; … ; 0.5803647031417776 0.6960132011019047 … 0.7807831392991891 0.724291588808329; 0.6986291202814392 0.8845014540596929 … 0.6516510865208662 0.8267851665796302])

In [6]:
p1, p2 = partition(n1, n2)
p1, p2 = CuArray(p1), CuArray(p2)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [71]:
function mean!(x, n, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:n
        @inbounds out[row_idx] += x[row_idx, i]
    end
    @inbounds out[row_idx] /= n
    return
end

mean! (generic function with 1 method)

In [72]:
y = CuArray{Float64, 1}(zeros(N))
@cuda threads=256 blocks=4 mean!(x1, size(x1)[2], y)
@test isapprox(y, mean(x1, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(y, mean(x1, dims = 2))
   Evaluated: isapprox([0.4158057617914684, 0.54089452534108, 0.4524898082598267, 0.5083081472702323, 0.4380688443521326, 0.3569448543013704, 0.4974078531665431, 0.5379865245441388, 0.33865822212714064, 0.45825727998292787  …  0.5096949390638388, 0.4583620706167644, 0.4325325430212037, 0.40978602133159847, 0.6342294446970657, 0.39765892264072716, 0.5147646691329809, 0.6389884962774152, 0.6543238609464797, 0.46963423479479843], [0.4158057617914685; 0.5408945253410798; … ; 0.6543238609464798; 0.4696342347947984;;])

In [73]:
function sumsq!(x, n, out)
    """out = sum(x, dims=2)"""
    row_idx = (blockIdx().x-1) * blockDim().x + threadIdx().x
    for i = 1:n
        @inbounds out[row_idx] += x[row_idx, i]^2
    end
    return
end

sumsq! (generic function with 1 method)

In [74]:
y = CuArray{Float64, 1}(zeros(N))
@cuda threads=256 blocks=4 sumsq!(x1, size(x1)[2], y)
@test isapprox(y, sum(x1.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(y, sum(x1 .^ 2, dims = 2))
   Evaluated: isapprox([2.9576283927036604, 4.349671309954059, 3.4525589483156507, 3.850952389300226, 3.2894621976014125, 2.7047426810673163, 3.6506137435184924, 4.1168173519881215, 2.4138347381971705, 3.3750119187813126  …  4.152528523319621, 3.656978200633897, 3.2408852423456094, 3.1151688567702163, 6.060194161149632, 2.404153347197356, 3.8251480857457043, 5.917030941377447, 6.061911574825796, 3.6821869867142154], [2.9576283927036604; 4.349671309954059; … ; 6.061911574825795; 3.682186986714216;;])

In [212]:
function _var!(n, ss, means, out)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds out[i] = (ss[i] - (n * means[i]^2)) / (n-1)
    return
end

_var! (generic function with 1 method)

In [213]:
function var_gpu(x)
    nrow, ncol = size(x)
    ss = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=256 blocks=4 sumsq!(x, ncol, ss)

    means = CUDA.zeros(Float64, nrow)
    @cuda threads=256 blocks=4 mean!(x, ncol, means)

    vars = CUDA.zeros(Float64, nrow)
    @cuda threads=256 blocks=4 _var!(ncol, ss, means, vars)
    return vars, means
end

var_gpu (generic function with 1 method)

In [214]:
@test isapprox(var_gpu(x2)[1], var(x2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox((var_gpu(x2))[1], var(x2, dims = 2))
   Evaluated: isapprox([0.08940683594148886, 0.1369454638998972, 0.0867649256335066, 0.11092958845303122, 0.06413028866107497, 0.06094058779485038, 0.10847738630726989, 0.11513032767354854, 0.03873637150156105, 0.11423126476380416  …  0.13785538796358937, 0.10591744565462413, 0.06977934626941026, 0.08077133833936598, 0.11205752476026896, 0.062380245995309655, 0.09279517748881325, 0.07773276376463945, 0.02578348553925469, 0.05013598112000585], [0.08940683594148888; 0.1369454638998971; … ; 0.025783485539254747; 0.05013598112000578;;])

In [298]:
function mul_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= val
    return
end

function mul_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] *= y[i]
    return
end

function div_gpu!(x, val)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= val
    return
end

function div_arr_gpu!(x, y)
    """ out = x ./ y """
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] /= y[i]
    return
end

function sqrt_gpu!(x)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds x[i] = sqrt(x[i])
    return
end

function add_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] + b[i]
    return
end

function subtract_gpu!(a, b, c)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds c[i] = a[i] - b[i]
    return
end

function copy_arr_gpu!(dest, source)
    i = (blockIdx().x-1) * blockDim().x + threadIdx().x
    @inbounds dest[i] = source[i]
    return
end

copy_arr_gpu! (generic function with 1 method)

In [319]:
using BenchmarkTools

K = 2^25
x = CUDA.rand(K)
y = CUDA.zeros(K)

@btime copyto!($y, $x)
@btime @cuda threads=256 blocks=Int(K/256) copy_arr_gpu!(y, x)
@test all(x .== y)

  2.811 μs (0 allocations: 0 bytes)
  7.392 μs (27 allocations: 1.33 KiB)


[32m[1mTest Passed[22m[39m
  Expression: all(x .== y)

In [316]:
function t_gpu(x, y; pooled=false)
    varx, meanx = var_gpu(x)
    vary, meany = var_gpu(y)
    nx, ny = size(x)[2], size(y)[2]
    
    if pooled
        @cuda threads=256 blocks=4 mul_gpu!(varx, nx-1)
        @cuda threads=256 blocks=4 mul_gpu!(vary, ny-1)
        @cuda threads=256 blocks=4 add_gpu!(varx, vary, varx)   # varx = (nx-1)*varx + (ny-1)*vary
        @cuda threads=256 blocks=4 div_gpu!(varx, nx+ny-2)      # varx /= nx+ny-2
        #@cuda threads=256 blocks=4 copy_arr_gpu!(vary, varx)
        copyto!(vary, varx)
    end
        
    @cuda threads=256 blocks=4 div_gpu!(varx, nx)
    @cuda threads=256 blocks=4 div_gpu!(vary, ny)
    
    denom = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=256 blocks=4 add_gpu!(varx, vary, denom)
    @cuda threads=256 blocks=4 sqrt_gpu!(denom)
    
    t = CUDA.zeros(Float64, size(x)[1])
    @cuda threads=256 blocks=4 subtract_gpu!(meanx, meany, t)
    @cuda threads=256 blocks=4 div_arr_gpu!(t, denom)
    return t
end

t_gpu (generic function with 1 method)

In [317]:
pooled = false
@test isapprox(t_gpu(x1, x2, pooled=pooled), ttest_ind(x1, x2, pooled))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x1, x2, pooled = pooled), ttest_ind(x1, x2, pooled))
   Evaluated: isapprox([0.18680399704742479, 0.6216597446398325, -0.6683190430389726, 0.1884296089524259, 0.16093649485200498, -1.2531079746102525, -0.19795712004603416, 0.5228226162731499, -0.7802173274332709, -0.8566160615959619  …  0.387100680745915, -0.7620851214754379, -1.927133042432487, -0.7962851993598136, 0.540963939159896, 0.5711508187115787, 1.8634379916531385, 1.0956321103730968, -0.048472174040136685, -2.5286926460914128], [0.1868039970474261; 0.6216597446398315; … ; -0.0484721740401356; -2.5286926460914136;;])