In [1]:
using CUDA, Random, Test
CUDA.allowscalar(false)

In [2]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [3]:
# Fix the global RNG seed so the random inputs generated below are reproducible.
Random.seed!(123)
N = 1024  # number of rows in each input matrix
n1, n2 = 12, 8  # number of columns in x1 and x2 respectively
# Draw uniform random matrices on the host, then copy each one to the GPU.
x1, x2 = CuArray(rand(N, n1)), CuArray(rand(N, n2))

([0.906299638797481 0.18750281732422014 … 0.5941010362656459 0.17323597986665784; 0.44349373245960455 0.498439397828113 … 0.35202055060070614 0.31073836950193956; … ; 0.3589185993287528 0.1114694679461018 … 0.6439746082053515 0.5539659994059574; 0.035927613151065296 0.7081160306383704 … 0.8242544528646908 0.9229874487702134], [0.9191811597933967 0.2925787758936972 … 0.741307376507605 0.9790050449921868; 0.42601923740838954 0.14932329779230624 … 0.7726209816563904 0.4140158502518958; … ; 0.0769627114182928 0.7625196951584902 … 0.16054705848932704 0.926818712518206; 0.48657054103959807 0.015825668052687814 … 0.8230914230335877 0.8461944332763968])

In [109]:
# `partition` is loaded from partition.jl (not visible here); presumably it enumerates
# the ways of splitting n1 + n2 pooled observations into two groups — TODO confirm.
p1, p2 = partition(n1, n2)
# Copy both index arrays to the GPU, rebinding the same names.
p1, p2 = CuArray(p1), CuArray(p2)

([1 2 … 11 12; 1 2 … 11 13; … ; 8 10 … 19 20; 9 10 … 19 20], [20 19 … 14 13; 20 19 … 14 12; … ; 9 7 … 2 1; 8 7 … 2 1])

In [5]:
# Use the first row of each input matrix as the two samples.
a, b = x1[1,:], x2[1,:]
# `tconf` is defined in an included file (not visible here); presumably it returns the
# lower/upper bounds of a t-based confidence interval at level 1 - alpha — TODO confirm.
wide_lo, wide_hi = tconf(a, b, alpha=0.01, pooled=false)
narrow_lo, narrow_hi = tconf(a, b, alpha=0.1, pooled=false)

(-0.3758134181173361, 0.15103016052040058)

In [6]:
# `search` is defined in an included file (not visible here); it is called with the two
# lower bounds computed above — NOTE(review): presumably these bracket the search, confirm.
search(a, b, p1, p2, wide_lo, narrow_lo; pooled=false)

-0.5987410951846774

In [7]:
# `permInterval` is defined in an included file (not visible here); the meaning of the
# trailing 0 argument cannot be determined from this notebook — TODO confirm.
permInterval(a, b, p1, p2, 0)

true

In [8]:
# Extract the first column of each input matrix (one value per row, N values each).
c = x1[:,1]
d = x2[:,1]

1024-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.9191811597933967
 0.42601923740838954
 0.7465863168484385
 0.8192008435733007
 0.9541592624784624
 0.8458950823510601
 0.586748700554989
 0.12181263792534469
 0.7894931647499924
 0.6192588014774537
 0.47764531343080796
 0.8041934288169849
 0.12353811904859124
 ⋮
 0.7226240454902878
 0.21388976332386556
 0.14177711414972805
 0.78587303920342
 0.592764826769585
 0.2528427068581415
 0.14147239723522842
 0.34833389389379155
 0.4733751048473892
 0.2989977739879336
 0.0769627114182928
 0.48657054103959807

In [9]:
"""
    row_sum!(x, n, out)

CUDA kernel: store in `out[r]` the sum of `x[r, 1:n]` for every row `r`, i.e.
`out = sum(x, dims=2)` when `n == size(x, 2)`.

Each thread handles the single row whose index is derived from its block and thread
indices. The sum is accumulated in a local variable and then written to `out`, so the
previous contents of `out` are overwritten and `out` does not need to be zeroed first.

# Arguments
- `x`: device matrix to reduce.
- `n`: number of columns to sum over.
- `out`: device vector receiving one sum per row.
"""
function row_sum!(x, n, out)
    row_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Threads launched beyond the last row must not touch memory: the accesses below
    # are `@inbounds`, so an out-of-range index would not be caught.
    if row_idx <= size(x, 1) && row_idx <= length(out)
        acc = zero(promote_type(eltype(x), eltype(out)))
        for i = 1:n
            @inbounds acc += x[row_idx, i]
        end
        @inbounds out[row_idx] = acc
    end
    return
end

row_sum! (generic function with 1 method)

In [10]:
# Zero-initialised device vector that receives one sum per row.
y = CuArray{Float64, 1}(zeros(N))
# 256 threads x 4 blocks = 1024 threads, matching N = 1024 rows (one thread per row).
@cuda threads=256 blocks=4 row_sum!(x1, size(x1)[2], y)
# Compare the kernel result against the library row-wise reduction.
@test isapprox(y, sum(x1, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(y, sum(x1, dims = 2))
   Evaluated: isapprox([6.53110238678372, 6.377362795307128, 6.919564267865844, 6.174720415752943, 7.817461484498605, 5.293765801948105, 4.557146998711857, 7.688457256848162, 5.4602616578311665, 4.1431747027884915  …  3.979767404455261, 5.578894708504509, 5.426513198117829, 6.230180953961174, 6.318454449768334, 6.759367514489274, 6.3418690026911335, 7.059379878760508, 7.5059121052625155, 5.577208109503038], [6.53110238678372; 6.3773627953071275; … ; 7.5059121052625155; 5.577208109503038;;])

In [11]:
"""
    mean!(x, n, out)

CUDA kernel: store in `out[r]` the mean of `x[r, 1:n]` for every row `r`, i.e.
`out = mean(x, dims=2)` when `n == size(x, 2)`.

Each thread handles the single row whose index is derived from its block and thread
indices. The sum is accumulated in a local variable, divided by `n`, and then written
to `out`, so the previous contents of `out` are overwritten and `out` may be an
uninitialised buffer.

# Arguments
- `x`: device matrix to reduce.
- `n`: number of columns to average over.
- `out`: device vector receiving one mean per row.
"""
function mean!(x, n, out)
    row_idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Threads launched beyond the last row must not touch memory: the accesses below
    # are `@inbounds`, so an out-of-range index would not be caught.
    if row_idx <= size(x, 1) && row_idx <= length(out)
        acc = zero(promote_type(eltype(x), eltype(out)))
        for i = 1:n
            @inbounds acc += x[row_idx, i]
        end
        @inbounds out[row_idx] = acc / n
    end
    return
end

mean! (generic function with 1 method)

In [12]:
# Zero-initialised device vector that receives one mean per row.
y = CuArray{Float64, 1}(zeros(N))
# 256 threads x 4 blocks = 1024 threads, matching N = 1024 rows (one thread per row).
@cuda threads=256 blocks=4 mean!(x1, size(x1)[2], y)
# Compare the kernel result against the library row-wise mean.
@test isapprox(y, mean(x1, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(y, mean(x1, dims = 2))
   Evaluated: isapprox([0.5442585322319767, 0.5314468996089273, 0.576630355655487, 0.5145600346460786, 0.651455123708217, 0.44114715016234207, 0.3797622498926547, 0.6407047714040135, 0.45502180481926385, 0.34526455856570765  …  0.3316472837046051, 0.4649078923753758, 0.4522094331764857, 0.5191817461634312, 0.5265378708140278, 0.5632806262074396, 0.5284890835575945, 0.5882816565633756, 0.6254926754385429, 0.46476734245858653], [0.5442585322319767; 0.5314468996089273; … ; 0.6254926754385429; 0.46476734245858653;;])

In [15]:
"""
    t_gpu(x1s, x2s, out; pooled=false)

Compute two-sample t statistics between `x1s` and `x2s`, reducing over the last
dimension of `x1s` (the observations of each group).

For matrix inputs with one sample per row, the result has one statistic per row and
keeps the reduced dimension as a singleton (shape `(rows, 1)`).

# Arguments
- `x1s`, `x2s`: arrays holding the two groups; observations lie along the last
  dimension of `x1s`, and `x2s` is reduced along that same dimension.
- `out`: currently unused; kept so existing call sites keep working.

# Keywords
- `pooled`: if `true`, use the pooled-variance (equal variance) denominator;
  otherwise use the unpooled denominator `sqrt(var1/n1 + var2/n2)`.

# Returns
The array `(mean1 - mean2) ./ denom`.
"""
function t_gpu(x1s, x2s, out; pooled=false)
    # Observations run along the last dimension; derive it locally instead of relying
    # on a global variable.
    d = ndims(x1s)
    n1, n2 = size(x1s, d), size(x2s, d)  # number of observations per group

    mean1 = mean(x1s, dims=d)
    var1 = var(x1s, mean=mean1, dims=d)

    mean2 = mean(x2s, dims=d)
    var2 = var(x2s, mean=mean2, dims=d)

    if pooled
        pooled_var = ((n1 - 1) .* var1 .+ (n2 - 1) .* var2) ./ (n1 + n2 - 2)
        denom = sqrt.(pooled_var .* (1 / n1 + 1 / n2))
    else
        denom = sqrt.(var1 ./ n1 .+ var2 ./ n2)
    end

    return (mean1 .- mean2) ./ denom
end

t_gpu (generic function with 1 method)

---

## Using intermediate kernel results

In [19]:
"""
    add!(a, b, c)

CUDA kernel: element-wise sum, `c[i] = a[i] + b[i]`.

Each thread handles the single element whose index is derived from its block and
thread indices; threads whose index lies beyond the end of `c` do nothing.
"""
function add!(a, b, c)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    # Guard so a launch with more threads than elements does not index past the end.
    if i <= length(c)
        c[i] = a[i] + b[i]
    end
    return
end

# Note: this cell rebinds the globals N, a, b and c used by earlier cells.
N = 1024
a = CuArray{Float64, 1}(zeros(N))
b = CuArray{Float64, 1}(ones(N))
c = CuArray{Float64, 1}(zeros(N))
# 256 threads x 4 blocks = 1024 threads, one per element.
@cuda threads=256 blocks=4 add!(a, b, c)
@test all(c .== a .+ b)  # all tests passed

[32m[1mTest Passed[22m[39m
  Expression: all(c .== a .+ b)

In [98]:
# Allocate an uninitialised device vector. `undef` leaves the contents unspecified, so
# any zeros shown in the output are incidental and must not be relied upon.
CuArray{Float64, 1}(undef, 10)

10-element CuArray{Float64, 1, CUDA.Mem.DeviceBuffer}:
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0
 0.0

In [107]:
"""
    g(x, y)

Return `mean(x, dims=2) + mean(y, dims=2)` as a device vector, computed by chaining
the `mean!` and `add!` kernels through intermediate device buffers.

`x` and `y` must be matrices with the same number of rows. The intermediate buffers
are zero-initialised, so the result is correct whether `mean!` accumulates into its
output or overwrites it.
"""
function g(x, y)
    xrow, xcol = size(x)
    yrow, ycol = size(y)
    @assert xrow == yrow

    out = CuArray{Float64, 1}(undef, yrow)  # fully overwritten by `add!`
    xrow == 0 && return out  # nothing to compute; avoid launching an empty grid

    # Pick a block size (at most 256) that divides the row count exactly, so that
    # threads * blocks == xrow and exactly one thread is launched per row. For
    # xrow == 1024 this gives the same 256 x 4 configuration as before.
    threads = gcd(xrow, 256)
    blocks = xrow ÷ threads

    mean1 = CUDA.zeros(Float64, xrow)
    @cuda threads=threads blocks=blocks mean!(x, xcol, mean1)

    mean2 = CUDA.zeros(Float64, yrow)
    @cuda threads=threads blocks=blocks mean!(y, ycol, mean2)

    @cuda threads=threads blocks=blocks add!(mean1, mean2, out)
    return out
end

g (generic function with 1 method)

In [108]:
# Check the chained-kernel result against the library row-wise means.
@test isapprox(g(x1, x2), mean(x1, dims=2) + mean(x2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(g(x1, x2), mean(x1, dims = 2) + mean(x2, dims = 2))
   Evaluated: isapprox([1.2009086932624213, 1.0659361742815818, 1.0739059719252197, 1.1184596251822434, 1.3727530075444005, 1.0021220425445314, 0.8147574384599667, 1.2328351698408144, 0.9419814101235331, 0.9002954819054065  …  0.7415527951910228, 0.9886823145719495, 0.9173212039938446, 1.071017180774851, 1.0674323495850333, 0.9529897318850561, 1.0975360635571954, 1.0239764750683034, 1.2069555542367216, 1.0296984624043461], [1.2009086932624213; 1.0659361742815818; … ; 1.2069555542367216; 1.0296984624043461;;])