In [1]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [2]:
mod = SourceModule("""
  #include <stdio.h>
  #include <math.h>

  // Matrix product c = a * b on row-major float buffers.
  //   a, b             : input matrices
  //   c                : output buffer, a_shape[0] rows by b_shape[1] columns
  //   a_shape, b_shape : {rows, cols} of a and b
  // Every thread produces one element of c: the y index selects the row of a,
  // the x index selects the column of b.
  __global__ void matmul(float *a, float *b, float *c, int *a_shape, int *b_shape)
  {
      const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
      const unsigned int col = blockDim.x * blockIdx.x + threadIdx.x;

      // Threads in the padded part of the grid have no output element.
      if (row >= a_shape[0] || col >= b_shape[1])
      {
          return;
      }

      int a_pos = row * a_shape[1];                     // first element of this row of a
      const int a_end = (row + 1) * a_shape[1];         // one past the end of that row
      int b_pos = col;                                  // first element of this column of b
      const int b_end = col + b_shape[0] * b_shape[1];  // one past the end of that column
      const int b_stride = b_shape[1];                  // stepping down a column skips one full row

      // Walk the row of a and the column of b in lock step, stopping when
      // either one is exhausted.
      float acc = 0;
      while (a_pos < a_end && b_pos < b_end)
      {
          acc += a[a_pos] * b[b_pos];
          a_pos += 1;
          b_pos += b_stride;
      }

      const int c_row_start = row * b_shape[1];
      c[c_row_start + (int)col] = acc;
  }
  // Writes the transpose of the row-major matrix a into a_T.
  //   a       : input matrix, a_shape[0] rows by a_shape[1] columns
  //   a_T     : output matrix, a_shape[1] rows by a_shape[0] columns
  //   a_shape : {rows, cols} of a
  // One thread moves one element: the y index is the source row, the x index
  // is the source column.
  __global__ void transpose(float *a, float *a_T, int *a_shape)
  {
      const int row = blockDim.y * blockIdx.y + threadIdx.y;
      const int col = blockDim.x * blockIdx.x + threadIdx.x;

      // Check each coordinate on its own. Testing only the flattened index lets
      // threads with col >= a_shape[1] through: they alias an element of the
      // next row and store it beyond the end of a_T.
      if (row < a_shape[0] && col < a_shape[1])
      {
          a_T[col * a_shape[0] + row] = a[row * a_shape[1] + col];
      }
  }
  
  // Mean of every row of the row-major matrix a (one value per row).
  //   a       : input matrix, a_shape[0] rows by a_shape[1] columns
  //   mean    : output, a_shape[0] values
  //   a_shape : {rows, cols} of a
  // One thread reduces one complete row.
  __global__ void row_mean(float *a, float *mean, int *a_shape)
  {
      const int row = blockDim.x * blockIdx.x + threadIdx.x;
      if (row >= a_shape[0])
      {
          return;  // padding thread, no row to reduce
      }

      const int n_cols = a_shape[1];
      const float *row_ptr = a + row * n_cols;

      float total = 0;
      for (int k = 0; k < n_cols; ++k)
      {
          total += row_ptr[k];
      }
      mean[row] = total / n_cols;
  }
  
  // Mean of every column of the row-major matrix a (one value per column).
  //   a       : input matrix, a_shape[0] rows by a_shape[1] columns
  //   mean    : output, a_shape[1] values
  //   a_shape : {rows, cols} of a
  // One thread reduces one complete column.
  __global__ void column_mean(float *a, float *mean, int *a_shape)
  {
      const int col = blockDim.x * blockIdx.x + threadIdx.x;
      if (col >= a_shape[1])
      {
          return;  // padding thread, no column to reduce
      }

      const int n_rows = a_shape[0];
      const int n_cols = a_shape[1];

      float total = 0;
      for (int r = 0; r < n_rows; ++r)
      {
          // Consecutive elements of a column are one row length apart.
          total += a[r * n_cols + col];
      }
      mean[col] = total / n_rows;
  }
  
  // Minimum value and the column where it occurs, for every row of a.
  //   a       : input matrix, a_shape[0] rows by a_shape[1] columns (row-major)
  //   a_shape : {rows, cols} of a
  //   min_row : output, smallest value of each row
  //   arg_min : output, column index of that value (first occurrence on ties)
  // One thread scans one complete row.
  __global__ void min_row(float *a, int *a_shape, float *min_row, int *arg_min)
  {
      const int row = blockDim.x * blockIdx.x + threadIdx.x;
      if (row < a_shape[0])
      {
          const int n_cols = a_shape[1];
          if (n_cols <= 0)
          {
              return;  // an empty row has no minimum; do not read a[]
          }

          const float *row_ptr = a + row * n_cols;

          // Track the running minimum in local variables and store it once at
          // the end instead of reading and writing the output arrays on every
          // iteration.
          float best = row_ptr[0];
          int best_col = 0;
          // Single loop bound: the earlier form joined two conditions with the
          // comma operator, which silently discards the first one.
          for (int col = 1; col < n_cols; ++col)
          {
              if (row_ptr[col] < best)
              {
                  best = row_ptr[col];
                  best_col = col;
              }
          }

          min_row[row] = best;
          arg_min[row] = best_col;
      }
  }
  
  // Sums a 3-D array over its last axis.
  //   a       : input, element (i, j, k) stored at k + a_shape[2]*j + a_shape[2]*a_shape[1]*i
  //   a_shape : {dim0, dim1, dim2} of a
  //   result  : output, a_shape[0] by a_shape[1], row-major
  // The y index selects i, the x index selects j; each thread adds up the
  // a_shape[2] contiguous values that share that (i, j).
  __global__ void sum_axis3(float *a, int *a_shape, float *result)
  {
      const int j = blockDim.x * blockIdx.x + threadIdx.x;
      const int i = blockDim.y * blockIdx.y + threadIdx.y;
      if (i >= a_shape[0] || j >= a_shape[1])
      {
          return;  // padding thread
      }

      const int depth = a_shape[2];
      const int out_idx = i * a_shape[1] + j;
      const float *src = a + out_idx * depth;

      float acc = 0;
      for (int k = 0; k < depth; ++k)
      {
          acc += src[k];
      }
      result[out_idx] = acc;
  }
  
    // Sums a 3-D array over its middle axis.
    //   a       : input, element (i, j, k) stored at k + a_shape[2]*j + a_shape[2]*a_shape[1]*i
    //   a_shape : {dim0, dim1, dim2} of a
    //   result  : output, a_shape[0] by a_shape[2], row-major
    // The y index selects i, the x index selects k; each thread adds up the
    // a_shape[1] values that share that (i, k).
    __global__ void sum_axis2(float *a, int *a_shape, float *result)
    {
        const int k = blockDim.x * blockIdx.x + threadIdx.x;
        const int i = blockDim.y * blockIdx.y + threadIdx.y;
        if (i >= a_shape[0] || k >= a_shape[2])
        {
            return;  // padding thread
        }

        const int n_mid = a_shape[1];
        const int stride = a_shape[2];  // distance between (i, j, k) and (i, j + 1, k)
        const int base = i * a_shape[1] * a_shape[2] + k;

        float acc = 0;
        for (int j = 0; j < n_mid; ++j)
        {
            acc += a[base + j * stride];
        }
        result[i * a_shape[2] + k] = acc;
    }
  
    // Sums a 3-D array over its first axis.
    //   a       : input, element (i, j, k) stored at k + a_shape[2]*j + a_shape[2]*a_shape[1]*i
    //   a_shape : {dim0, dim1, dim2} of a
    //   result  : output, a_shape[1] by a_shape[2], row-major
    // The y index selects j, the x index selects k; each thread adds up the
    // a_shape[0] values that share that (j, k).
    __global__ void sum_axis1(float *a, int *a_shape, float *result)
    {
        const int k = blockDim.x * blockIdx.x + threadIdx.x;
        const int j = blockDim.y * blockIdx.y + threadIdx.y;
        if (j >= a_shape[1] || k >= a_shape[2])
        {
            return;  // padding thread
        }

        const int n_outer = a_shape[0];
        const int stride = a_shape[2] * a_shape[1];  // distance between (i, j, k) and (i + 1, j, k)
        const int out_idx = j * a_shape[2] + k;      // also the offset of (0, j, k) in a

        float acc = 0;
        for (int i = 0; i < n_outer; ++i)
        {
            acc += a[out_idx + i * stride];
        }
        result[out_idx] = acc;
    }
  
      // For every row of data, finds the row of mu with the smallest squared
      // Euclidean distance to it.
      //   data       : input points, data_shape[0] rows of data_shape[1] floats
      //   mu         : candidate rows, mu_shape[0] rows of mu_shape[1] floats
      //   data_shape : {rows, cols} of data
      //   mu_shape   : {rows, cols} of mu
      //   arg_min    : output, index of the closest row of mu for each data row
      //                (first one on ties, -1 when mu has no rows)
      // One thread handles one row of data. The distance is accumulated over
      // mu_shape[1] components.
      __global__ void argmin_mu_diff(float *data, float *mu, int *data_shape, int *mu_shape, int *arg_min)
      {
          const int point = blockDim.x * blockIdx.x + threadIdx.x;
          if (point < data_shape[0])
          {
              const int n_centers = mu_shape[0];
              const int n_dims = mu_shape[1];
              const float *x = data + point * data_shape[1];

              // Keep the winning index as an int and accept the first candidate
              // unconditionally. Seeding the minimum with INT_MAX would reject
              // every candidate whose squared distance is above about 2.1e9.
              int best = -1;
              float best_dist = 0;
              for (int c = 0; c < n_centers; ++c)
              {
                  const float *m = mu + c * n_dims;
                  float dist = 0;
                  for (int d = 0; d < n_dims; ++d)
                  {
                      const float delta = x[d] - m[d];
                      dist += delta * delta;
                  }
                  if (best < 0 || dist < best_dist)
                  {
                      best_dist = dist;
                      best = c;
                  }
              }
              arg_min[point] = best;
          }
      }
  
  
  """)

In [None]:
# Operands for the matmul check; float32 matches the kernel's `float *` arguments.
a = np.random.randn(10549, 8982).astype(np.float32)
b = np.random.randn(8982, 10549).astype(np.float32)
# Create the output as float32 directly rather than building a float64 array and converting it.
c = np.zeros((a.shape[0], b.shape[1]), dtype=np.float32)
# The kernel declares its shape arguments as `int *`, so pass signed 32-bit integers.
SHAPE_A = np.array(a.shape, dtype=np.int32)
SHAPE_B = np.array(b.shape, dtype=np.int32)
print(SHAPE_A, SHAPE_B)

In [None]:
# Reserve one device buffer per host array (same byte size) ...
a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
SHAPE_B_gpu = cuda.mem_alloc(SHAPE_B.nbytes)

# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (b_gpu, b), (c_gpu, c),
                            (SHAPE_A_gpu, SHAPE_A), (SHAPE_B_gpu, SHAPE_B)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [None]:
# Handle to the compiled `matmul` kernel from the SourceModule defined above.
func = mod.get_function("matmul")

In [None]:
# Launch geometry: 32 x 32 threads per block and enough blocks to cover all of c.
BLOCK_DIMX = BLOCK_DIMY = 32
# Integer ceiling division: x covers the columns of b, y covers the rows of a.
GRID_DIMX = (b.shape[1] + BLOCK_DIMX - 1) // BLOCK_DIMX
GRID_DIMY = (a.shape[0] + BLOCK_DIMY - 1) // BLOCK_DIMY

print (GRID_DIMX, GRID_DIMY)

In [None]:
%%time
func(a_gpu, b_gpu, c_gpu, SHAPE_A_gpu, SHAPE_B_gpu, block=(BLOCK_DIMX, BLOCK_DIMY, 1), grid=(GRID_DIMX, GRID_DIMY, 1))

In [None]:
# Download the product into a new host array shaped like c, then compare with NumPy.
results = cuda.from_device_like(c_gpu, c)
print(results)
print(np.allclose(np.matmul(a, b), results, atol=1e-2))

In [None]:
%%time
# CPU reference timing for the same product.
np.matmul(a, b)

In [None]:
##TRANSPOSE##
a = np.random.randn(10549, 8982).astype(np.float32)
# Output has the reversed shape; create it as float32 directly (no float64 temporary).
a_T = np.zeros(a.shape[::-1], dtype=np.float32)
# The kernel declares its shape argument as `int *`, so pass signed 32-bit integers.
SHAPE_A = np.array(a.shape, dtype=np.int32)

In [None]:
# Reserve device buffers for the input, the output and the shape ...
a_gpu = cuda.mem_alloc(a.nbytes)
a_T_gpu = cuda.mem_alloc(a_T.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (a_T_gpu, a_T), (SHAPE_A_gpu, SHAPE_A)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [None]:
# Look up the compiled transpose kernel and derive its launch geometry.
func = mod.get_function("transpose")
BLOCK_DIMX = BLOCK_DIMY = 32
# Integer ceiling division: x covers the columns of a, y covers its rows.
GRID_DIMX = (a.shape[1] + BLOCK_DIMX - 1) // BLOCK_DIMX
GRID_DIMY = (a.shape[0] + BLOCK_DIMY - 1) // BLOCK_DIMY
print (GRID_DIMX, GRID_DIMY)



In [None]:
%%time
func(a_gpu, a_T_gpu, SHAPE_A_gpu, block=(BLOCK_DIMX, BLOCK_DIMY, 1), grid=(GRID_DIMX, GRID_DIMY, 1))

In [None]:
# Download the transposed matrix into a new host array shaped like a_T, then compare with NumPy.
results = cuda.from_device_like(a_T_gpu, a_T)
print(results)
print(np.allclose(np.transpose(a), results, atol=1e-4))

In [None]:
%%time
np.transpose(a)

In [3]:
## ROW MEAN ##
a = np.random.randn(10549, 8982).astype(np.float32)
# One mean per row; create the output as float32 directly (no float64 temporary).
mean_a = np.zeros(a.shape[0], dtype=np.float32)
# The kernel declares its shape argument as `int *`, so pass signed 32-bit integers.
SHAPE_A = np.array(a.shape, dtype=np.int32)

In [4]:
# Reserve device buffers for the input, the per-row means and the shape ...
a_gpu = cuda.mem_alloc(a.nbytes)
mean_a_gpu = cuda.mem_alloc(mean_a.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (mean_a_gpu, mean_a), (SHAPE_A_gpu, SHAPE_A)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [5]:
# One thread per row: 1024 threads per block, enough blocks to cover every row of a.
func = mod.get_function("row_mean")
BLOCK_DIMX = 1024
GRID_DIMX = (a.shape[0] + BLOCK_DIMX - 1) // BLOCK_DIMX  # integer ceiling division
print (GRID_DIMX)

11


In [6]:
%%time
func(a_gpu, mean_a_gpu, SHAPE_A_gpu, block=(BLOCK_DIMX, 1, 1), grid=(GRID_DIMX, 1, 1))

CPU times: user 532 µs, sys: 90 µs, total: 622 µs
Wall time: 626 µs


In [7]:
# Download the row means into a new host array shaped like mean_a, then compare with NumPy.
results = cuda.from_device_like(mean_a_gpu, mean_a)
print(results)
print(np.allclose(np.mean(a, axis=1), results, atol=1e-4))

[ 0.01379276 -0.01639137 -0.00990051 ...,  0.00314257  0.0034696
  0.00319359]
True


In [8]:
%%time
np.mean(a, axis=1)

CPU times: user 40.7 ms, sys: 98 µs, total: 40.8 ms
Wall time: 40.8 ms


array([ 0.01379275, -0.01639137, -0.00990053, ...,  0.00314256,
        0.0034696 ,  0.00319358], dtype=float32)

In [9]:
## COLUMN MEAN ##
a = np.random.randn(10549, 8982).astype(np.float32)
# One mean per column; create the output as float32 directly (no float64 temporary).
mean_a = np.zeros(a.shape[1], dtype=np.float32)
# The kernel declares its shape argument as `int *`, so pass signed 32-bit integers.
SHAPE_A = np.array(a.shape, dtype=np.int32)

In [10]:
# Reserve device buffers for the input, the per-column means and the shape ...
a_gpu = cuda.mem_alloc(a.nbytes)
mean_a_gpu = cuda.mem_alloc(mean_a.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (mean_a_gpu, mean_a), (SHAPE_A_gpu, SHAPE_A)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [11]:
# One thread per column: 1024 threads per block, enough blocks to cover every column of a.
func = mod.get_function("column_mean")
BLOCK_DIMX = 1024
GRID_DIMX = (a.shape[1] + BLOCK_DIMX - 1) // BLOCK_DIMX  # integer ceiling division
print (GRID_DIMX)

9


In [12]:
%%time
func(a_gpu, mean_a_gpu, SHAPE_A_gpu, block=(BLOCK_DIMX, 1, 1), grid=(GRID_DIMX, 1, 1))

CPU times: user 55 µs, sys: 8 µs, total: 63 µs
Wall time: 66 µs


In [13]:
# Download the column means into a new host array shaped like mean_a, then compare with NumPy.
results = cuda.from_device_like(mean_a_gpu, mean_a)
print(results)
print(np.allclose(np.mean(a, axis=0), results, atol=1e-4))

[ 0.00875998  0.01302794 -0.00799079 ..., -0.01028944 -0.00198964
  0.00089069]
True


In [14]:
%%time
np.mean(a, axis=0)

CPU times: user 37.2 ms, sys: 160 µs, total: 37.4 ms
Wall time: 37.1 ms


array([ 0.00875998,  0.01302794, -0.00799079, ..., -0.01028944,
       -0.00198964,  0.00089069], dtype=float32)

In [15]:
## ARGMIN, MIN ##
a = np.random.randn(10549, 8982).astype(np.float32)
min_a = np.zeros([a.shape[0]]).astype(np.float32)
argmin_a = np.zeros([a.shape[0]]).astype(np.uint32)
SHAPE_A = np.array(a.shape).astype(np.uint32)

In [16]:
# Reserve device buffers for the input, both outputs and the shape ...
a_gpu = cuda.mem_alloc(a.nbytes)
min_a_gpu = cuda.mem_alloc(min_a.nbytes)
argmin_a_gpu = cuda.mem_alloc(argmin_a.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (min_a_gpu, min_a),
                            (argmin_a_gpu, argmin_a), (SHAPE_A_gpu, SHAPE_A)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [18]:
# One thread per row: 1024 threads per block, enough blocks to cover every row of a.
func = mod.get_function("min_row")
BLOCK_DIMX = 1024
GRID_DIMX = (a.shape[0] + BLOCK_DIMX - 1) // BLOCK_DIMX  # integer ceiling division
print (GRID_DIMX)

11


In [19]:
%%time
func(a_gpu, SHAPE_A_gpu, min_a_gpu, argmin_a_gpu, block=(BLOCK_DIMX, 1, 1), grid=(GRID_DIMX, 1, 1))

CPU times: user 71 µs, sys: 9 µs, total: 80 µs
Wall time: 83 µs


In [20]:
# Download the row minima into a new host array shaped like min_a, then compare with NumPy.
results = cuda.from_device_like(min_a_gpu, min_a)
print(results)
print(np.allclose(np.min(a, axis=1), results, atol=1e-4))

[-3.85049248 -3.55566049 -4.19751215 ..., -3.73588705 -4.49926805
 -3.76275706]
True


In [21]:
%%time
np.min(a, axis=1)

CPU times: user 27.5 ms, sys: 0 ns, total: 27.5 ms
Wall time: 26.8 ms


array([-3.85049248, -3.55566049, -4.19751215, ..., -3.73588705,
       -4.49926805, -3.76275706], dtype=float32)

In [22]:
# Download the per-row argmin indices computed on the GPU.
results = np.empty_like(argmin_a)
cuda.memcpy_dtoh(results, argmin_a_gpu)
print(results)
# Indices must match exactly. np.allclose applies a relative tolerance, which
# would accept off-by-one indices once they are large enough.
print(np.array_equal(np.argmin(a, axis=1), results))

[6349  448 1200 ..., 6421 6132 8354]
True


In [23]:
## SUM AXIS##
a = np.random.randn(1054, 89,45).astype(np.float32)
sum3_a = np.zeros([a.shape[0], a.shape[1]]).astype(np.float32)
sum2_a = np.zeros([a.shape[0], a.shape[2]]).astype(np.float32)
sum1_a = np.zeros([a.shape[1], a.shape[2]]).astype(np.float32)
SHAPE_A = np.array(a.shape).astype(np.uint32)

In [24]:
# Reserve device buffers for the input, the three outputs and the shape ...
a_gpu = cuda.mem_alloc(a.nbytes)
sum3_a_gpu = cuda.mem_alloc(sum3_a.nbytes)
sum2_a_gpu = cuda.mem_alloc(sum2_a.nbytes)
sum1_a_gpu = cuda.mem_alloc(sum1_a.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)

# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (sum3_a_gpu, sum3_a), (sum2_a_gpu, sum2_a),
                            (sum1_a_gpu, sum1_a), (SHAPE_A_gpu, SHAPE_A)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [25]:
# Kernel that reduces the last axis; one thread per (axis 0, axis 1) pair.
func = mod.get_function("sum_axis3")
BLOCK_DIMX = BLOCK_DIMY = 32
# Integer ceiling division: x covers axis 1, y covers axis 0.
GRID_DIMX = (a.shape[1] + BLOCK_DIMX - 1) // BLOCK_DIMX
GRID_DIMY = (a.shape[0] + BLOCK_DIMY - 1) // BLOCK_DIMY
print (GRID_DIMX, GRID_DIMY)

(3, 33)


In [26]:
%%time
func(a_gpu, SHAPE_A_gpu, sum3_a_gpu, block=(BLOCK_DIMX, BLOCK_DIMY, 1), grid=(GRID_DIMX, GRID_DIMY, 1))

CPU times: user 83 µs, sys: 10 µs, total: 93 µs
Wall time: 91.1 µs


In [27]:
# Download the last-axis sums into a new host array shaped like sum3_a, then compare with NumPy.
results = cuda.from_device_like(sum3_a_gpu, sum3_a)
print(results)
print(np.allclose(np.sum(a, axis=2), results, atol=1e-5))

[[  2.76620173   3.46547246  -6.58266258 ...,  12.91075134   9.27414417
   -0.20195746]
 [ -0.71003604   3.96370769   1.51221001 ...,   1.15611005  -3.51532745
    4.20375776]
 [  5.52328873  -3.01439714   9.02602386 ...,  -6.46910334   5.86660385
   -1.304389  ]
 ..., 
 [  1.89996386   5.34373951  -2.49017596 ...,   5.4591713   -3.82690263
    0.73662424]
 [ 13.79426289   4.93118811  -2.96993637 ...,  -0.7317217  -11.43707466
    4.18983364]
 [ -0.17461634  -8.47156334  -4.94273281 ...,   4.93384695  11.56926537
    5.59148455]]
True


In [28]:
%%time
np.sum(a, axis=2)

CPU times: user 4.57 ms, sys: 70 µs, total: 4.64 ms
Wall time: 4.11 ms


array([[  2.76620221,   3.46547151,  -6.58266258, ...,  12.91075134,
          9.27414417,  -0.20195735],
       [ -0.71003652,   3.96370864,   1.51220906, ...,   1.15611076,
         -3.51532745,   4.20375776],
       [  5.52328873,  -3.01439714,   9.026021  , ...,  -6.46910286,
          5.86660433,  -1.30438852],
       ..., 
       [  1.89996374,   5.34373856,  -2.49017549, ...,   5.45917034,
         -3.82690263,   0.73662472],
       [ 13.79426193,   4.93118811,  -2.96993589, ...,  -0.73172122,
        -11.43707466,   4.18983078],
       [ -0.17461705,  -8.47156334,  -4.94273329, ...,   4.93384743,
         11.56926346,   5.59148455]], dtype=float32)

In [29]:
# Kernel that reduces the middle axis; one thread per (axis 0, axis 2) pair.
func = mod.get_function("sum_axis2")
BLOCK_DIMX = BLOCK_DIMY = 32
# Integer ceiling division: x covers axis 2, y covers axis 0.
GRID_DIMX = (a.shape[2] + BLOCK_DIMX - 1) // BLOCK_DIMX
GRID_DIMY = (a.shape[0] + BLOCK_DIMY - 1) // BLOCK_DIMY
print (GRID_DIMX, GRID_DIMY)

(2, 33)


In [30]:
%%time
func(a_gpu, SHAPE_A_gpu, sum2_a_gpu, block=(BLOCK_DIMX, BLOCK_DIMY, 1), grid=(GRID_DIMX, GRID_DIMY, 1))

CPU times: user 62 µs, sys: 8 µs, total: 70 µs
Wall time: 66 µs


In [31]:
# Download the middle-axis sums into a new host array shaped like sum2_a, then compare with NumPy.
results = cuda.from_device_like(sum2_a_gpu, sum2_a)
print(results)
print(np.allclose(np.sum(a, axis=1), results, atol=1e-5))

[[-10.35563946  -2.84267688  -4.73142624 ...,   9.26231384   8.08987999
   -5.98818731]
 [  5.91100359   6.87636423   4.91737843 ..., -10.0587616    3.02491474
   -8.86170101]
 [  4.76445818  -4.81286144  -2.51421356 ...,  -9.17828274  10.49326992
    5.56969309]
 ..., 
 [ 13.83674335 -12.90276909   9.26255608 ..., -11.63795757  17.81700706
   -0.82999754]
 [  9.45636177   9.15571785  17.81391525 ...,  -1.41644168   3.24659348
   -5.94546223]
 [  4.4145093   21.88774109   5.29872561 ...,   1.59486461  -2.7983191
   12.5598917 ]]
True


In [32]:
%%time
np.sum(a, axis=1)

CPU times: user 4.22 ms, sys: 27 µs, total: 4.25 ms
Wall time: 3.88 ms


array([[-10.35563946,  -2.84267688,  -4.73142624, ...,   9.26231384,
          8.08987999,  -5.98818731],
       [  5.91100359,   6.87636423,   4.91737843, ..., -10.0587616 ,
          3.02491474,  -8.86170101],
       [  4.76445818,  -4.81286144,  -2.51421356, ...,  -9.17828274,
         10.49326992,   5.56969309],
       ..., 
       [ 13.83674335, -12.90276909,   9.26255608, ..., -11.63795757,
         17.81700706,  -0.82999754],
       [  9.45636177,   9.15571785,  17.81391525, ...,  -1.41644168,
          3.24659348,  -5.94546223],
       [  4.4145093 ,  21.88774109,   5.29872561, ...,   1.59486461,
         -2.7983191 ,  12.5598917 ]], dtype=float32)

In [33]:
# Kernel that reduces the first axis; one thread per (axis 1, axis 2) pair.
func = mod.get_function("sum_axis1")
BLOCK_DIMX = BLOCK_DIMY = 32
# Integer ceiling division: x covers axis 2, y covers axis 1.
GRID_DIMX = (a.shape[2] + BLOCK_DIMX - 1) // BLOCK_DIMX
GRID_DIMY = (a.shape[1] + BLOCK_DIMY - 1) // BLOCK_DIMY
print (GRID_DIMX, GRID_DIMY)

(2, 3)


In [34]:
%%time
func(a_gpu, SHAPE_A_gpu, sum1_a_gpu, block=(BLOCK_DIMX, BLOCK_DIMY, 1), grid=(GRID_DIMX, GRID_DIMY, 1))

CPU times: user 60 µs, sys: 8 µs, total: 68 µs
Wall time: 68.9 µs


In [35]:
# Download the first-axis sums into a new host array shaped like sum1_a, then compare with NumPy.
results = cuda.from_device_like(sum1_a_gpu, sum1_a)
print(results)
print(np.allclose(np.sum(a, axis=0), results, atol=1e-5))

[[-73.16607666  27.83765411  20.21465492 ..., -23.16413307  71.95492554
  -48.57486343]
 [ 63.09356689 -55.15248108  72.82141876 ..., -30.33496284 -79.31492615
  -47.61728668]
 [ 49.15576172  28.31169701 -13.40281773 ...,  40.66150665 -65.16136932
    2.33651495]
 ..., 
 [  3.4313221   12.96168423  13.49198818 ..., -27.3525219   39.18828583
   12.34987354]
 [ 56.69228745 -24.02456284  -3.21201944 ...,   8.04820061  40.63549042
   35.67337036]
 [  7.17916489 -11.18802643 -32.46915054 ...,  24.59671783  26.51755524
   10.77449417]]
True


In [36]:
%%time
np.sum(a, axis=0)

CPU times: user 2.49 ms, sys: 314 µs, total: 2.81 ms
Wall time: 2 ms


array([[-73.16607666,  27.83765411,  20.21465492, ..., -23.16413307,
         71.95492554, -48.57486343],
       [ 63.09356689, -55.15248108,  72.82141876, ..., -30.33496284,
        -79.31492615, -47.61728668],
       [ 49.15576172,  28.31169701, -13.40281773, ...,  40.66150665,
        -65.16136932,   2.33651495],
       ..., 
       [  3.4313221 ,  12.96168423,  13.49198818, ..., -27.3525219 ,
         39.18828583,  12.34987354],
       [ 56.69228745, -24.02456284,  -3.21201944, ...,   8.04820061,
         40.63549042,  35.67337036],
       [  7.17916489, -11.18802643, -32.46915054, ...,  24.59671783,
         26.51755524,  10.77449417]], dtype=float32)

In [37]:
## MU_DIFF#
a = np.random.randn(1054, 89).astype(np.float32)
mu = np.random.randn(25, 89).astype(np.float32)
SHAPE_A = np.array(a.shape).astype(np.uint32)
SHAPE_MU = np.array(mu.shape).astype(np.uint32)
argmin = np.zeros(a.shape[0]).astype(np.uint32)

In [38]:
# Reserve device buffers for the points, the candidate rows, the output and both shapes ...
a_gpu = cuda.mem_alloc(a.nbytes)
mu_gpu = cuda.mem_alloc(mu.nbytes)
argmin_gpu = cuda.mem_alloc(argmin.nbytes)
SHAPE_A_gpu = cuda.mem_alloc(SHAPE_A.nbytes)
SHAPE_MU_gpu = cuda.mem_alloc(SHAPE_MU.nbytes)

# ... then copy every host array into its buffer.
for _dev_buf, _host_arr in [(a_gpu, a), (mu_gpu, mu), (argmin_gpu, argmin),
                            (SHAPE_A_gpu, SHAPE_A), (SHAPE_MU_gpu, SHAPE_MU)]:
    cuda.memcpy_htod(_dev_buf, _host_arr)

In [39]:
# One thread per row of a: 1024 threads per block, enough blocks to cover every row.
func = mod.get_function("argmin_mu_diff")
BLOCK_DIMX = 1024
GRID_DIMX = (a.shape[0] + BLOCK_DIMX - 1) // BLOCK_DIMX  # integer ceiling division
print (GRID_DIMX)

2


In [40]:
%%time
func(a_gpu, mu_gpu, SHAPE_A_gpu,SHAPE_MU_gpu, argmin_gpu, block=(BLOCK_DIMX, 1, 1), grid=(GRID_DIMX, 1, 1))

CPU times: user 66 µs, sys: 8 µs, total: 74 µs
Wall time: 70.1 µs


In [41]:
# Download the nearest-row indices into a new host array shaped like argmin.
results = cuda.from_device_like(argmin_gpu, argmin)
print(results)


[ 3 17 16 ..., 18 18  7]


In [42]:
%%time
ans = np.argmin(np.sum(np.square(a[:, None, :] - mu[None, :, :]), axis=-1), axis=-1) # n,k,d

CPU times: user 10.1 ms, sys: 0 ns, total: 10.1 ms
Wall time: 9.52 ms


In [43]:
# Indices must match exactly, so compare with array_equal rather than a tolerance-based check.
print(np.array_equal(ans, results))

True
