In [1]:
#!pip install pycuda

In [2]:
%%writefile show_mtx.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>


/*
 * show_mtx_ker: debugging kernel that prints one matrix element per thread.
 *
 * Every launched thread first prints a greeting with its flat index, its
 * index within the block and its block index. After a block-wide barrier,
 * threads whose flat index is below dim_x * dim_y print the element
 * data_mtx[tid] together with the pair (i, j) = (tid % dim_x, tid / dim_x).
 *
 * data_mtx : buffer indexed linearly by tid
 *            (assumed to hold at least dim_x * dim_y floats; confirm at the caller)
 * dim_x    : divisor used to split tid into (i, j)
 * dim_y    : together with dim_x bounds the number of printed elements
 */
extern "C" __global__ void show_mtx_ker(float * data_mtx, int dim_x, int dim_y)
{
    // Flat index of this thread across the 1-D launch.
    const int tid = blockDim.x * blockIdx.x + threadIdx.x;

    printf("Hello world from tid %d, thread %d, in block %d!\n", tid, threadIdx.x, blockIdx.x);

    // Barrier is reached by every thread of the block (it sits before the guard below).
    __syncthreads();

    // Threads past the last element have nothing to print.
    const int n_elems = dim_x * dim_y;
    if (tid >= n_elems)
        return;

    // Split the flat index: i is the remainder, j the quotient, by dim_x.
    const int i = tid % dim_x;
    const int j = tid / dim_x;
    printf("tid: %d, i: %d, j: %d, mtx[i,j]: %f\n", tid, i, j, data_mtx[tid]);
}


Overwriting show_mtx.cu


In [3]:
!nvcc -ptx -o show_mtx.ptx show_mtx.cu

In [4]:
%%writefile test.py

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from time import time

import numpy as np

# Matrix dimensions passed to the kernel: dim_x columns, dim_y rows.
dim_x = 7
dim_y = 5
n_elems = dim_x * dim_y

# float32 test matrix of shape (dim_y, dim_x) holding 0 .. n_elems-1, so each
# element's value equals its flat (row-major) index.
data_mtx = np.arange(n_elems, dtype=np.float32).reshape(dim_y, dim_x)

print('data_mtx:')
print(data_mtx)
print('-----')

# Copy the host matrix to the GPU.
data_gpu = gpuarray.to_gpu(data_mtx)

# Load the PTX produced by the nvcc cell and look up the kernel entry point.
my_mod = drv.module_from_file('./show_mtx.ptx')
show_mtx_ker = my_mod.get_function('show_mtx_ker')

# One thread per element; integer ceil-division gives enough blocks to cover
# all n_elems without going through floating point.
blocksize = 10
gridsize = (n_elems + blocksize - 1) // blocksize

print(f'block size: {blocksize}')
print(f'grid size: {gridsize}')
# Flush host output now so it stays ahead of the kernel's printf output when
# stdout is piped (e.g. when run via `!python test.py`).
print('-----', flush=True)

show_mtx_ker(
    data_gpu,
    np.int32(dim_x),
    np.int32(dim_y),
    grid=(gridsize, 1, 1),
    block=(blocksize, 1, 1),
)

# Kernel launches are asynchronous: wait for completion so the device-side
# printf buffer is flushed and any launch/runtime error is raised here rather
# than silently at interpreter shutdown.
drv.Context.synchronize()


Overwriting test.py


In [5]:
!python test.py

data_mtx:
[[ 0.  1.  2.  3.  4.  5.  6.]
 [ 7.  8.  9. 10. 11. 12. 13.]
 [14. 15. 16. 17. 18. 19. 20.]
 [21. 22. 23. 24. 25. 26. 27.]
 [28. 29. 30. 31. 32. 33. 34.]]
-----
block size: 10
grid size: 4
-----
Hello world from tid 10, thread 0, in block 1!
Hello world from tid 11, thread 1, in block 1!
Hello world from tid 12, thread 2, in block 1!
Hello world from tid 13, thread 3, in block 1!
Hello world from tid 14, thread 4, in block 1!
Hello world from tid 15, thread 5, in block 1!
Hello world from tid 16, thread 6, in block 1!
Hello world from tid 17, thread 7, in block 1!
Hello world from tid 18, thread 8, in block 1!
Hello world from tid 19, thread 9, in block 1!
Hello world from tid 20, thread 0, in block 2!
Hello world from tid 21, thread 1, in block 2!
Hello world from tid 22, thread 2, in block 2!
Hello world from tid 23, thread 3, in block 2!
Hello world from tid 24, thread 4, in block 2!
Hello world from tid 25, thread 5, in block 2!
Hello world from tid 26, thread 6, in bloc