In [None]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

In [2]:
import numpy as np
# 4x4 int64 demo matrix holding 0..15 in row-major order.
a = np.arange(16, dtype=np.int64).reshape(4, 4)
# One non-negative int32 random draw per element of `a`.
int32_max = np.iinfo(np.int32).max
r = np.random.randint(int32_max, size=(4, 4)).astype(np.int32)
r

array([[2146318115, 1807009153, 1561418033, 1053861022],
       [1018657457,  160920048, 1294831761,  149078618],
       [ 821953232,  910358515, 1464311275,  169963125],
       [1407982476, 1617435701,  685769209, 2016026574]], dtype=int32)

In [3]:
# Allocate a device buffer the size of `a` and copy the host data into it.
a_gpu = cuda.to_device(a)

In [4]:
# CUDA source for the row-wise shuffle kernel.
# The data pointer is declared `long long` (not `long`) because the host array
# is np.int64 and `long` is only 32 bits wide under the Windows (LLP64) ABI.
mod_src = """
// In-place Fisher-Yates shuffle of every row of a row-major 2-D array,
// one thread per row.
//   a     : n_rows * row_len 64-bit integers, shuffled in place
//   r     : n_rows * row_len random ints, one per element of a;
//           values must be non-negative or j can fall outside the row
//   shape : {n_rows, row_len}
// The entry point keeps the name "doublify" so existing get_function()
// lookups continue to work.
__global__ void doublify(long long *a, const int *r, const int *shape)
{
  const int n_rows = shape[0];
  const int row_len = shape[1];
  const int row_n = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_n >= n_rows) return;  // surplus threads in the last block

  // 64-bit offset: row_n * row_len can exceed INT_MAX for large arrays.
  const long long row_start_idx = (long long)row_n * row_len;
  long long *row = a + row_start_idx;
  const int *row_r = r + row_start_idx;

  // The last element needs no swap, so stop at row_len - 2.
  for (int i = 0; i < row_len - 1; i++) {
    // Pick j in [i, row_len) and swap elements i and j.
    const int j = row_r[i] % (row_len - i) + i;
    const long long tmp = row[i];
    row[i] = row[j];
    row[j] = tmp;
  }
}
"""
mod = SourceModule(mod_src)
func = mod.get_function("doublify")

In [5]:
# Shuffle the rows of the device copy of `a`. `r` and the shape vector are
# wrapped in cuda.In so they are handed to the kernel as input buffers.
func(
    a_gpu,
    cuda.In(r),
    cuda.In(np.asarray(a.shape, dtype=np.int32)),
    grid=(2, 1, 1),
    block=(256, 1, 1),
)
# Copy the device buffer back into a fresh host array and print it next to the
# host-side input, which this cell does not modify.
a_doubled = np.empty(a.shape, dtype=a.dtype)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print(a_doubled)
print(a)

[[ 3  2  0  1]
 [ 5  4  7  6]
 [ 8 10 11  9]
 [12 15 13 14]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]


In [6]:
def shuffle_row_wise_cuda(a):
    """Return a copy of ``a`` with every row independently shuffled on the GPU.

    Uses the module-level compiled kernel ``func`` (one thread per row) and
    prints wall-clock timings for the random-number generation and for the
    kernel execution.

    Parameters
    ----------
    a : numpy.ndarray
        2-D array of dtype int64. It is not modified.

    Returns
    -------
    numpy.ndarray
        New array with the same shape and dtype as ``a``.

    Raises
    ------
    ValueError
        If ``a`` is not 2-D.
    TypeError
        If ``a`` is not int64 (the kernel reads 64-bit integers).
    """
    import time  # function-scope import keeps this cell self-contained

    if a.ndim != 2:
        raise ValueError("expected a 2-D array, got %d dimension(s)" % a.ndim)
    if a.dtype != np.int64:
        raise TypeError("expected an int64 array, got %s" % a.dtype)
    if a.size == 0:
        # Nothing to shuffle; also avoids a zero-byte allocation and a
        # zero-block launch.
        return a.copy()

    # memcpy_htod needs one contiguous buffer and the kernel indexes row-major.
    a = np.ascontiguousarray(a)
    n_rows = a.shape[0]

    # Plain assignment rather than an IPython `%time` magic: a magic inside a
    # function body cannot bind the local name `r`, so the later uses of `r`
    # silently picked up whatever global `r` existed in the notebook.
    t0 = time.perf_counter()
    r = np.random.randint(np.iinfo(np.int32).max, size=a.shape, dtype=np.int32)
    print("random draws: %.3f ms" % ((time.perf_counter() - t0) * 1e3))

    shape_arr = np.array(a.shape, dtype=np.int32)

    # Use this kernel's own thread limit, not the device's maximum x-dimension.
    block_size = func.get_attribute(cuda.function_attribute.MAX_THREADS_PER_BLOCK)
    # Integer ceil-division, correct regardless of the `/` semantics in use.
    num_blocks = (n_rows + block_size - 1) // block_size

    buffers = []
    try:
        for host_arr in (a, r, shape_arr):
            dev_buf = cuda.mem_alloc(host_arr.nbytes)
            buffers.append(dev_buf)
            cuda.memcpy_htod(dev_buf, host_arr)
        a_gpu, r_gpu, shape_gpu = buffers

        t0 = time.perf_counter()
        func(a_gpu, r_gpu, shape_gpu, block=(block_size, 1, 1), grid=(num_blocks, 1, 1))
        # Launches return before the kernel finishes; wait so the timing
        # covers the actual execution.
        cuda.Context.synchronize()
        print("kernel: %.3f ms" % ((time.perf_counter() - t0) * 1e3))

        a2 = np.empty_like(a)
        cuda.memcpy_dtoh(a2, a_gpu)
    finally:
        # Release device memory promptly instead of waiting for garbage collection.
        for dev_buf in buffers:
            dev_buf.free()
    return a2

def shuffle_row_wise_numpy(a):
    """Shuffle each row of ``a`` in place using NumPy's global random state.

    Rows are shuffled one at a time with ``np.random.shuffle``. The input
    array itself is modified and nothing is returned.
    """
    n_rows = a.shape[0]
    for row_idx in range(n_rows):
        row_view = a[row_idx]  # a view for 2-D input, so the shuffle writes into `a`
        np.random.shuffle(row_view)

In [None]:
arr = np.arange(5, dtype=np.int64).reshape(1, -1).repeat(50000, axis=0)
print(arr[0])
%time arr2 = shuffle_row_wise_cuda(arr)
print(arr2)

%time shuffle_row_wise_numpy(arr)
print(arr)

In [None]:
import pycuda.autoinit
import pycuda.driver as cuda

# Free and total memory as reported by cuda.mem_get_info().
free, total = cuda.mem_get_info()
print("Global memory occupancy:%f%% free" % (free * 100 / total))

# Print every attribute PyCUDA reports for each device.
for devicenum in range(cuda.Device.count()):
    device = cuda.Device(devicenum)
    attrs = device.get_attributes()

    print("\n===Attributes for device %d" % devicenum)
    for key, value in attrs.items():
        print("%s:%s" % (key, value))