In [120]:
#!pip install pycuda

In [121]:
%%writefile show_mtx.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>


extern "C" __global__ void show_mtx_ker(float * data_mtx, int dim_x, int dim_y, float * centers_mtx, int n_clusters, float * output_mtx, float * output_labels)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    printf("Hello world from tid %d, thread %d, in block %d!\n", tid, threadIdx.x, blockIdx.x);
	  __syncthreads();

    //assign this thread tid to the ith column of the data_mtx
    // for all columns of the data mtx do this:
    if ( tid < dim_x ) //we don't care about threads with larger tid
    {   
        int col = tid;
        float current_best_val=-1 ;
          
        for (int k = 0; k< n_clusters; k++)
        { 
          float sq_dist_k=0 ;
          for (int row = 0; row< dim_y; row++)
          {
            float val_data_mtx = data_mtx[col+row*dim_x];
            float val_centers_mtx = centers_mtx[k+row*n_clusters];
            float diff = val_data_mtx-val_centers_mtx;
            sq_dist_k=sq_dist_k+diff*diff ;
            //printf("tid: %d, col: %d, k: %d, row: %d, val_data_mtx: %f, val_centers_mtx: %f\n", tid, col, k, row, val_data_mtx, val_centers_mtx); 
          }
          output_mtx[col+k*dim_x]=sq_dist_k;   
          if (current_best_val< 0)
          {
            current_best_val=sq_dist_k;
            output_labels[tid] = k;
          }       
          else if (current_best_val>sq_dist_k)
          {
              output_labels[tid]=k;
          }
          printf("tid: %d, col: %d, k: %d, sq_dist_k: %f\n", tid, col, k, sq_dist_k); 
        }
                     
        
    }
    
    return;
}


Overwriting show_mtx.cu


In [122]:
!nvcc -ptx -o show_mtx.ptx show_mtx.cu

In [123]:
%%writefile test.py

import pycuda.autoinit
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule
from time import time
from sklearn.datasets import make_blobs

import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')

import numpy as np

n_samples=20
n_clusters=3
n_features =2

dim_x= n_samples
dim_y=n_features

# Creating a n_features=2 Dim dataset with centers=3 clusters
np.random.seed(150)

Xs, Ys = make_blobs(n_samples=n_samples, n_features=n_features, centers=n_clusters)

#data_mtx=np.float32(np.random.randn(dim_x*dim_y).reshape([dim_y,dim_x]))
#data_mtx=np.float32(np.arange(dim_x*dim_y).reshape([dim_y,dim_x]))
data_mtx=np.float32(Xs.T.copy())
centers_mtx = data_mtx[:,:n_clusters]

output_mtx=np.float32(np.zeros([n_clusters,n_samples]))
output_labels=np.float32(-np.ones(n_samples))

print(f' data mtx shape: {data_mtx.shape}')
print(f' centers_mtx shape: {centers_mtx.shape}')

print('data_mtx:')
print(data_mtx)
print('-----')

print('centers_mtx:')
print(centers_mtx)
print('-----')

data_gpu=gpuarray.to_gpu(data_mtx)

print(data_gpu.shape)
centers_mtx_gpu=gpuarray.to_gpu(centers_mtx)
output_mtx_gpu = gpuarray.to_gpu(output_mtx)
output_labels_gpu= gpuarray.to_gpu(output_labels)

my_mod = pycuda.driver.module_from_file('./show_mtx.ptx')
show_mtx_ker = my_mod.get_function('show_mtx_ker')

blocksize = 10
gridsize = int(np.ceil(dim_x*dim_y / blocksize))

print(f'block size: {blocksize}')
print(f'grid size: {gridsize}')
print('-----')

show_mtx_ker(data_gpu, np.int32(dim_x), np.int32(dim_y), centers_mtx_gpu, np.int32(n_clusters), output_mtx_gpu, output_labels_gpu, grid=(gridsize, 1, 1), block=(blocksize,1,1))

results=output_mtx_gpu.get()
print(results.T)

results=output_labels_gpu.get()
print(results.T)

#---------------
print('verification:')
output_mtx_cpu=np.float32(np.zeros([n_clusters,n_samples]))
for i in np.arange(n_samples):
  for k in np.arange(n_clusters):
    val=data_mtx[:,i]-centers_mtx[:,k]
    output_mtx_cpu[k,i]=np.inner(val,val)
print(output_mtx_cpu.T)


Overwriting test.py


In [124]:
!python test.py

 data mtx shape: (2, 20)
 centers_mtx shape: (2, 3)
data_mtx:
[[ 3.663308    7.3695297   6.819901    8.437142    4.7248526   8.923332
   7.799213    7.479743    5.251107    3.6881251  10.128995    4.195058
   8.474582    4.878855    3.6564035   8.111536   10.864538    8.075478
   8.0367365   6.803934  ]
 [-0.07746135  4.6528845   3.0645247   5.6480713   1.6812141   4.778726
  -3.1128001   2.6383731   1.5249537  -0.7423982  -6.306928    0.27397767
  -5.6713934   5.57361     1.0146273  -4.934861   -6.2164364   3.518177
  -6.790291   -5.4330854 ]]
-----
centers_mtx:
[[ 3.663308    7.3695297   6.819901  ]
 [-0.07746135  4.6528845   3.0645247 ]]
-----
(2, 20)
block size: 10
grid size: 4
-----
Hello world from tid 10, thread 0, in block 1!
Hello world from tid 11, thread 1, in block 1!
Hello world from tid 12, thread 2, in block 1!
Hello world from tid 13, thread 3, in block 1!
Hello world from tid 14, thread 4, in block 1!
Hello world from tid 15, thread 5, in block 1!
Hello world from tid 