In [1]:
from scipy.stats import ttest_ind
import numpy as np
from utils import partition
from perm_test import permInterval, pval, tconf

In [2]:
n, n1, n2 = 64, 12, 8
np.random.seed(123)
x1s = np.random.normal(size=(n, n1)).astype("f")
x2s = np.random.normal(size=(n, n2)).astype("f")
ts = ttest_ind(x1s, x2s, axis=-1).statistic
ts

array([ 0.02288745,  2.0926654 , -0.43440118,  0.60474104,  2.0320015 ,
       -1.2283934 ,  0.7599544 ,  0.34247264, -0.5637238 ,  0.064959  ,
        0.35305887,  2.3999612 , -1.6936734 ,  0.21588634,  1.1284194 ,
       -0.88623947, -0.7898006 ,  1.2211282 , -1.0268205 , -1.0317919 ,
        1.0080342 ,  1.1724267 , -0.22055963, -1.3208809 ,  1.6397921 ,
        0.24637549, -2.0423365 ,  1.224522  ,  0.6851538 , -0.0684469 ,
       -0.01847662, -0.37820017, -1.7693397 , -0.7219246 ,  1.3305384 ,
       -0.1906493 ,  0.50128424, -1.3641483 ,  0.5594391 ,  0.5448983 ,
        0.29982695, -1.9513791 , -0.49591184,  0.2882335 ,  0.748388  ,
       -0.86470735,  0.5089196 ,  0.18490031, -0.329755  , -0.00321723,
        1.9339507 ,  1.1011589 , -0.7417909 , -0.5083448 , -0.6956452 ,
        0.37978023,  0.71493536,  0.4924357 ,  0.596291  ,  0.5522851 ,
        1.1185036 , -1.480141  ,  0.83585465, -2.5387344 ], dtype=float32)

In [3]:
data = np.hstack((x1s, x2s))
data.shape

(64, 20)

In [4]:
x1, x2 = x1s[0], x2s[0]
t_obs = ttest_ind(x1, x2, axis=-1).statistic
t_obs

0.022887447471290987

In [5]:
parts = partition(n1, n2)
parts

array([[ 0,  1,  2, ..., 17, 18, 19],
       [ 0,  1,  2, ..., 17, 18, 19],
       [ 0,  1,  2, ..., 17, 18, 19],
       ...,
       [ 7,  8, 10, ...,  5,  6,  9],
       [ 7,  9, 10, ...,  5,  6,  8],
       [ 8,  9, 10, ...,  5,  6,  7]])

In [6]:
permInterval(x1, x2, parts, 0)

True

In [7]:
from numba import vectorize

@vectorize(['float32(float32)'],target='cuda')
def gpu_normalize(x):
    return x / 255

In [8]:
from numba import cuda

@cuda.jit(device=True)
def add(array): 
  acc = 0
  for val in array: 
    acc += abs(val)
  return math.sqrt(acc)

#@guvectorize(['(float32[:], float32[:])'],
#             '(n)->()',                
#             target='cuda')
@cuda.jit
def gpu_average(array, out):
    out[0] = add(array)/len(array)

In [55]:
from numba import cuda

@cuda.jit
def gpu_sqrt_kernel(x, out, parts):
    idx = cuda.grid(1)
    data = x[idx]
    x1, x2 = data[:n1], data[n1:]
    out[idx] = add(x1)

In [30]:
# move input data to the device
d_in = cuda.to_device(data)
# create output data on the device
d_out = cuda.device_array_like(d_in)
d_parts = cuda.to_device(parts)

# we decide to use 32 blocks, each containing 128 threads
blocks_per_grid = 32
threads_per_block = 128
gpu_sqrt_kernel[blocks_per_grid, threads_per_block](d_in, d_out, d_parts)
# wait for all threads to complete
cuda.synchronize()
# copy the output array back to the host system
# and print it
print(d_out.copy_to_host())

[[1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 ...
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]
 [1. 1. 1. ... 1. 1. 1.]]




In [69]:
@cuda.jit('float32(float32, float32, float32)', device=True)
def cu_device_fn(x, y, z):
    return x ** y / z

# define a ufunc that calls our device function
@vectorize(['float32(float32, float32, float32)'], target='cuda')
def cu_ufunc(x, y, z):
    return cu_device_fn(x, y, z)

In [70]:
cu_ufunc(10, 2, 2)



array([49.999996], dtype=float32)

In [54]:
from numba_stats import t

t.ppf(0.975, 1, 0, 1)

12.706204736432095

In [71]:
@vectorize(['float32(float32, int32, float32, float32)'], target='cuda')
def tcuda(p, df, mean, var):
    # return t.ppf(p, df, mean, var)
    return 1.0

In [72]:
tcuda(0.3, 1, 0, 1)



array([1.])