# Setup

In [80]:
# Report how many threads this Julia session has available.
Threads.nthreads()

8

In [81]:
using Distributed, Folds, FLoops
using Random, Distributions

In [82]:
include("perm_test.jl")
include("partition.jl")

partition (generic function with 1 method)

In [83]:
# Simulation dimensions: `data` gets `n` rows, each holding `n1` draws followed by `n2` draws.
n = 256
n1 = 12
n2 = 8

d = Normal()
delta_true = 0
parts = partition(n1, n2)

# Seed the global RNG immediately before drawing so the samples are reproducible.
Random.seed!(123)
x1s = rand(d, (n, n1))
x2s = rand(d, (n, n2))

# Columns 1:n1 come from `x1s`, columns n1+1:end from `x2s`.
data = [x1s x2s]

256×20 Matrix{Float64}:
 -0.645731     1.34194    -0.19564   -1.05875    …  -1.35274    -0.370654
 -1.46325     -1.18862    -0.915458  -0.920206       1.14854    -0.921302
 -1.6236       1.18954    -0.167391   2.08291        0.297835   -0.483097
 -0.217665    -1.6712     -0.75522   -0.281233       0.425549    1.91035
  0.492246     0.190436   -1.00009    0.0599196     -0.260339   -1.41726
  0.98098     -1.27217    -0.498621   0.179808   …   1.58057    -0.460531
  0.0799568   -0.803196    0.802551  -0.437841      -0.936614   -0.243236
  1.54912      1.97076     0.146786   0.0807004     -1.3201      1.90304
 -1.34161      1.54823    -0.922078   1.58911       -0.114953    0.109298
  0.412162    -0.126723    1.22042    0.668593      -0.399888   -0.910424
  0.593197     0.174584   -0.223876   0.749044   …  -0.372004   -0.0788338
 -0.768409     0.267046   -0.966475  -0.163043      -1.16322    -0.253188
 -0.0761679    1.326      -0.950514   1.42283        1.16112     0.874681
  ⋮             

---

# CPU Programming

In [None]:
# Time `permInterval` on the first row only: the first `n1` entries form one sample,
# the remaining entries the other.
x = data[1, :]
x1 = x[1:n1]
x2 = x[(n1 + 1):end]
@time permInterval(x1, x2, parts, delta_true)

In [None]:
# Sum `permInterval` over every row of `data` with a distributed (+) reduction.
# The loop iterates over the rows of `data` itself rather than a separately tracked global
# row count, so the bounds can never disagree with the matrix being indexed.
@time @distributed (+) for i in axes(data, 1)
    permInterval(data[i, 1:n1], data[i, n1+1:end], parts, delta_true)
end

In [None]:
# Accumulate the per-row `permInterval` results into `coverage` with FLoops' `@reduce`.
@time @floop for row in eachrow(data)
    covered = permInterval(row[1:n1], row[n1+1:end], parts, delta_true)
    @reduce(coverage += covered)
end

In [None]:
# Same computation with Folds: map `permInterval` over the rows, then reduce with `+`.
@time let
    per_row = Folds.map(eachrow(data)) do x
        permInterval(x[1:n1], x[n1+1:end], parts, delta_true)
    end
    Folds.reduce(+, per_row)
end

---

# GPU Programming

In [20]:
using CUDA, Test, Random

# Two N×N matrices of uniform draws, generated on the host with a fixed seed and then
# copied to the device.
N = 1024
Random.seed!(123)
x1 = rand(N, N) |> CuArray
x2 = rand(N, N) |> CuArray

1024×1024 CuArray{Float64, 2, CUDA.Mem.DeviceBuffer}:
 0.919181   0.292579   0.825846   …  0.168067    0.104088   0.229287
 0.426019   0.149323   0.489151      0.235193    0.594976   0.617096
 0.746586   0.188196   0.699858      0.960604    0.556726   0.879213
 0.819201   0.566957   0.539838      0.990984    0.260809   0.941378
 0.954159   0.0496869  0.948309      0.0772474   0.262274   0.561383
 0.845895   0.506965   0.0744135  …  0.67193     0.217708   0.41441
 0.586749   0.0930013  0.0117718     0.432447    0.79591    0.176363
 0.121813   0.997974   0.398319      0.52053     0.80667    0.458327
 0.789493   0.709287   0.143511      0.838646    0.840307   0.190893
 0.619259   0.910902   0.241169      0.226593    0.0996554  0.564104
 0.477645   0.436953   0.220376   …  0.61497     0.89362    0.563459
 0.804193   0.673175   0.129352      0.846492    0.479529   0.34562
 0.123538   0.6756     0.303669      0.824918    0.986141   0.0999812
 ⋮                                ⋱               

## Student T Distribution

Note that this only covers computing the [t-test statistic](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html) and not calculations involving the actual t distribution (e.g., CDF, PDF, quantiles).

In [69]:
"""
    mean_gpu(x, d)

Return the arithmetic mean of `x` along dimension `d`.

The result has the shape of `sum(x, dims=d)`, i.e. dimension `d` is kept with length 1.
"""
function mean_gpu(x, d)
    # Divide out of place: writing the quotient back into the sum (`./=`) throws
    # `InexactError` for integer input, because a fractional mean cannot be stored in the
    # integer-typed sum array.
    return sum(x; dims=d) ./ size(x, d)
end

"""
    sum_sq_gpu(x, d)

Return the sum of the squared elements of `x` along dimension `d`.

Dimension `d` is kept with length 1 in the result.
"""
function sum_sq_gpu(x, d)
    squared = x .^ 2
    return sum(squared; dims=d)
end

"""
    var_gpu(x, n, mean, ss)

Return the sample variance computed from precomputed summaries, elementwise:
`(ss - n * mean^2) / (n - 1)`.

# Arguments
- `x`: not used by the computation; kept as part of the signature.
- `n`: number of observations behind each entry of `mean` and `ss`.
- `mean`: means of the observations.
- `ss`: sums of the squared observations.
"""
function var_gpu(x, n, mean, ss)
    dof = n .- 1
    centered_ss = ss .- n .* mean .^ 2
    return centered_ss ./ dof
end

var_gpu (generic function with 2 methods)

In [41]:
# `mean_gpu` along the last dimension of the 2-D `x1` must agree with `mean` along dims=2.
# NOTE(review): this compares floating-point results with exact `==`; `≈` would be more
# robust if either implementation changes its reduction order.
@test all(mean_gpu(x1, ndims(x1)) .== mean(x1, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: all(mean_gpu(x1, ndims(x1)) .== mean(x1, dims = 2))

In [48]:
# `sum_sq_gpu` along the last dimension of the 2-D `x1` must agree with summing `x1.^2`
# along dims=2.
# NOTE(review): exact `==` on floating-point sums is fragile; consider `≈`.
@test all(sum_sq_gpu(x1, ndims(x1)) .== sum(x1.^2, dims=2))

[32m[1mTest Passed[22m[39m
  Expression: all(sum_sq_gpu(x1, ndims(x1)) .== sum(x1 .^ 2, dims = 2))

In [79]:
# Check `var_gpu` against `var` along the last dimension of `x1`.
# The helpers live in a `let` block so that `d` and `n` stay local: as top-level
# assignments they would overwrite the globals of the same name created in the Setup
# section (the distribution `d` and the row count `n`).
let d = ndims(x1), n = size(x1, d)
    m = mean_gpu(x1, d)
    sq = sum_sq_gpu(x1, d)
    @test isapprox(var_gpu(x1, n, m, sq), var(x1, dims=2))
end

[32m[1mTest Passed[22m[39m
  Expression: isapprox(var_gpu(x1, n, m, sq), var(x1, dims = 2))
   Evaluated: isapprox([0.0833572631339034; 0.08410008546151253; … ; 0.08270401746402671; 0.08124917847073232;;], [0.08335726313390343; 0.0841000854615125; … ; 0.08270401746402672; 0.08124917847073242;;])

In [74]:
"""
    t_gpu(x1, x2, pooled)

Compute the two-sample t statistic between `x1` and `x2`, reducing along the last
dimension of `x1` (`ndims(x1)`).

# Arguments
- `x1`, `x2`: sample arrays; means and variances are taken along dimension `ndims(x1)`.
- `pooled`: when `true`, use the pooled variance estimate (equal-variance Student test);
  when `false`, use the separate per-sample variances (Welch's test).

# Returns
An array of t statistics with the reduced dimension kept at length 1.
"""
function t_gpu(x1, x2, pooled)
    d = ndims(x1)
    n1, n2 = size(x1, d), size(x2, d)

    mean1, mean2 = mean_gpu(x1, d), mean_gpu(x2, d)
    ss1, ss2 = sum_sq_gpu(x1, d), sum_sq_gpu(x2, d)
    var1, var2 = var_gpu(x1, n1, mean1, ss1), var_gpu(x2, n2, mean2, ss2)

    if pooled
        # Pooled variance: per-sample variances weighted by their degrees of freedom.
        dof = n1 + n2 - 2
        pooled_var = @. ((n1 - 1) * var1 + (n2 - 1) * var2) / dof
        return @. (mean1 - mean2) / sqrt(pooled_var * (1 / n1 + 1 / n2))
    end

    # Welch: each sample contributes its own variance to the standard error.
    return @. (mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)
end

t_gpu (generic function with 1 method)

In [77]:
# Compare the unpooled (`pooled = false`) statistic from `t_gpu` with a reference value.
# NOTE(review): `ttest_ind` is not defined in the cells shown here — presumably a
# reference implementation loaded elsewhere; confirm before relying on this check.
@test isapprox(t_gpu(x1, x2, false), ttest_ind(x1, x2, false))

[32m[1mTest Passed[22m[39m
  Expression: isapprox(t_gpu(x1, x2, false), ttest_ind(x1, x2, false))
   Evaluated: isapprox([1.206540586996704; -0.9959457166403523; … ; 0.09742026421765294; 0.4959504570725251;;], [1.206540586996704; -0.9959457166403523; … ; 0.09742026421765293; 0.49595045707252494;;])

---

In [None]:
"""
    gpu_sum!(x, y)

CUDA kernel that adds the sum of the elements of `x` to `y[1]`.

Each thread starts at its global thread index and steps through `x` by the total number
of launched threads. The result is accumulated into `y[1]`, so `y[1]` must hold zero
before the launch for it to equal `sum(x)` afterwards.
"""
function gpu_sum!(x, y)
    index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    stride = gridDim().x * blockDim().x

    # Accumulate privately first so each thread touches `y[1]` once, not once per element.
    partial = zero(promote_type(eltype(x), eltype(y)))
    for i in index:stride:length(x)
        @inbounds partial += x[i]
    end

    # A plain `y[1] += ...` issued by many threads is an unsynchronised read-modify-write
    # and loses updates; combine the per-thread partial sums atomically instead.
    CUDA.atomic_add!(pointer(y, 1), convert(eltype(y), partial))
    return nothing
end

In [None]:
# Enough 256-thread blocks to cover N elements.
numblocks = cld(N, 256)
# NOTE(review): `x` and `y` are not created in the cells shown here — presumably `x` is a
# device vector and `y` a one-element device array initialised to zero; confirm.
@cuda threads=256 blocks=numblocks gpu_sum!(x, y)
# The kernel leaves the total in `y[1]`, so compare that single value with the reduction
# of `x`; an elementwise `x .== y` would compare every input element with the total.
# Copying `y` to the host also waits for the kernel to finish.
@test Array(y)[1] ≈ sum(x)

---

In [None]:
"""
    bench_gpu3!(y, x)

Launch `gpu_add3!(y, x)` with 256 threads per block and enough blocks to cover
`length(y)`, and wait for the device to finish before returning.

NOTE(review): `gpu_add3!` is not defined in the cells shown here; confirm its contract.
"""
function bench_gpu3!(y, x)
    threads_per_block = 256
    numblocks = cld(length(y), threads_per_block)
    # Synchronise so a caller timing this function measures the kernel, not just the launch.
    return CUDA.@sync @cuda threads=threads_per_block blocks=numblocks gpu_add3!(y, x)
end

In [None]:
using BenchmarkTools

# `$` interpolates the globals into the benchmark so that global-variable lookup is not
# part of the measured time.
# NOTE(review): `y_d` and `x_d` are not defined in the cells shown here — presumably
# device arrays created elsewhere; confirm.
@btime bench_gpu3!($y_d, $x_d)