In [1]:
# Report the GPU model, driver version and CUDA version visible to this runtime.
!nvidia-smi

Sun Mar 26 06:27:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   61C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Uncomment the next line to install PyCUDA if it is missing from this runtime.
# !pip install pycuda

In [3]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda import gpuarray
from pycuda.compiler import SourceModule

In [4]:
ker = SourceModule(no_extern_c=True ,source='''
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#define TILE_SIZE 2

//Matrix multiplication using non shared kernel

extern "C" {
// Naive matrix multiply: every thread computes one element of d_c = d_a * d_b.
//
//   d_a, d_b : input matrices, size x size floats, element (r, c) at [r * size + c]
//   d_c      : output matrix with the same layout; it is overwritten, so it does
//              not need to be zero-initialised by the caller
//   size     : edge length of the square matrices
//
// Expects a 2D launch whose grid covers at least size x size threads; threads
// that land outside the matrix exit without touching memory.
__global__ void gpu_Matrix_Mul_nonshared(float *d_a, float *d_b, float *d_c, const int size)
{
	// Use the real block dimensions so the index is right for any block shape
	// (identical to TILE_SIZE when the block is TILE_SIZE x TILE_SIZE).
	const int col = blockDim.x * blockIdx.x + threadIdx.x;
	const int row = blockDim.y * blockIdx.y + threadIdx.y;

	// Guard against grids that overshoot the matrix edge.
	if (row >= size || col >= size)
		return;

	// Accumulate in a local variable and store once, instead of doing a
	// read-modify-write on d_c for every k (which also relied on d_c
	// starting at zero).
	float acc = 0.0f;
	for (int k = 0; k < size; k++)
	{
		acc += d_a[row * size + k] * d_b[k * size + col];
	}
	d_c[row * size + col] = acc;
}

// Matrix multiplication using shared memory tiles: d_c = d_a * d_b.
//
//   d_a, d_b : input matrices, size x size floats, element (r, c) at [r * size + c]
//   d_c      : output matrix with the same layout; it is overwritten, so it does
//              not need to be zero-initialised by the caller
//   size     : edge length of the square matrices (need not be a multiple of
//              TILE_SIZE as long as the grid covers the whole matrix)
//
// Must be launched with TILE_SIZE x TILE_SIZE thread blocks: threadIdx is used
// directly as the index into the TILE_SIZE x TILE_SIZE shared arrays.
__global__ void gpu_Matrix_Mul_shared(float *d_a, float *d_b, float *d_c, const int size)
{
	//Defining Shared Memory
	__shared__ float shared_a[TILE_SIZE][TILE_SIZE];
	__shared__ float shared_b[TILE_SIZE][TILE_SIZE];

	const int col = TILE_SIZE * blockIdx.x + threadIdx.x;
	const int row = TILE_SIZE * blockIdx.y + threadIdx.y;

	// Round up so a trailing partial tile is still processed.
	const int num_tiles = (size + TILE_SIZE - 1) / TILE_SIZE;

	// Running dot product kept local to the thread; written to d_c once at the end.
	float acc = 0.0f;

	for (int i = 0; i < num_tiles; i++)
	{
		const int a_col = i * TILE_SIZE + threadIdx.x;
		const int b_row = i * TILE_SIZE + threadIdx.y;

		// Out-of-range positions are loaded as 0 so they add nothing to the sum.
		// No thread returns early: every thread of the block has to reach both
		// barriers below.
		shared_a[threadIdx.y][threadIdx.x] =
			(row < size && a_col < size) ? d_a[row * size + a_col] : 0.0f;
		shared_b[threadIdx.y][threadIdx.x] =
			(b_row < size && col < size) ? d_b[b_row * size + col] : 0.0f;
		__syncthreads(); // both tiles fully written before anyone reads them

		for (int j = 0; j < TILE_SIZE; j++)
			acc += shared_a[threadIdx.y][j] * shared_b[j][threadIdx.x];
		__syncthreads(); // everyone done reading before the next iteration overwrites the tiles
	}

	if (row < size && col < size)
		d_c[row * size + col] = acc;
}


}// (End of 'extern "C"' here)
''')

In [5]:
# Small fixed 4x4 float32 matrices: every row of test_a is 1..4 and
# every row of test_b is 14..11.
test_a = np.tile(np.arange(1, 5, dtype=np.float32), (4, 1))
test_b = np.tile(np.arange(14, 10, -1, dtype=np.float32), (4, 1))

In [6]:
# Edge length of the square test matrices. The same value is passed to the
# kernel as its `size` argument and used to derive the launch grid.
size = 4

In [7]:
# Seeded generator so that Restart & Run All reproduces the same matrices.
rng = np.random.default_rng(0)

# Random size x size float32 inputs with entries in [0, 10).
test_a = (10 * rng.random((size, size))).astype(np.float32)
test_b = (10 * rng.random((size, size))).astype(np.float32)

# Copy the inputs from host memory to the GPU.
test_a_gpu = gpuarray.to_gpu(test_a)
test_b_gpu = gpuarray.to_gpu(test_b)

# Start the output at zero rather than as uninitialised device memory
# (empty_like): a kernel that accumulates into it with `+=`, or a cell that
# inspects it before the launch, then sees defined values.
test_results_gpu = gpuarray.zeros_like(test_a_gpu)


In [8]:
# Show the two host inputs, then the device output buffer as it stands
# before the kernel has been launched.
for matrix in (test_a, test_b, test_results_gpu):
    print(matrix)

[[8.674211   2.1706622  4.245011   5.0828986 ]
 [4.9376445  1.7164611  2.822008   5.896873  ]
 [4.8189936  5.949731   6.4991546  5.9674377 ]
 [8.107354   4.5761924  4.3101206  0.30929336]]
[[7.6307817 4.663706  2.4389946 2.6522996]
 [5.6197724 1.3444306 6.5065374 0.6455543]
 [5.9780693 4.311874  7.865382  9.994088 ]
 [8.475948  4.3961754 1.2009869 9.936801 ]]
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
# Choose which compiled kernel to launch. Both kernels take the same
# arguments, so swap the comment marker between the two lines below to run
# the non-shared-memory variant instead.
#mult_ker = ker.get_function("gpu_Matrix_Mul_nonshared")
mult_ker = ker.get_function("gpu_Matrix_Mul_shared")

In [10]:
# Threads per block along each axis. This must stay equal to the
# `#define TILE_SIZE` in the CUDA source, which sizes the shared-memory tiles.
TILE_SIZE = 2

# The grid below uses floor division, so it only covers the whole matrix when
# size is a positive multiple of TILE_SIZE. Fail loudly instead of silently
# leaving part of the result uncomputed.
if size <= 0 or size % TILE_SIZE != 0:
    raise ValueError(
        f"size ({size}) must be a positive multiple of TILE_SIZE ({TILE_SIZE})"
    )

# One TILE_SIZE x TILE_SIZE block per output tile, one thread per output element.
dimGrid = (size // TILE_SIZE, size // TILE_SIZE, 1)
dimBlock = (TILE_SIZE, TILE_SIZE, 1)

print(dimGrid)
print(dimBlock)

(2, 2, 1)
(2, 2, 1)


In [11]:
# Launch the selected kernel on the device arrays with the grid/block
# geometry computed above.
mult_ker(
    test_a_gpu,        # d_a
    test_b_gpu,        # d_b
    test_results_gpu,  # d_c (output)
    np.int32(size),    # matches the kernel's `const int size` parameter
    block=dimBlock,
    grid=dimGrid,
)

In [12]:
# Copy the kernel's result back to a host NumPy array; as the last expression
# in the cell it is also displayed.
test_results_gpu.get()

array([[146.84898 ,  84.02154 ,  74.77297 , 117.34065 ],
       [114.17596 ,  63.42721 ,  52.48934 , 101.00363 ],
       [159.6409  ,  84.73081 , 108.75079 , 140.87265 ],
       [115.97036 ,  63.907093,  83.82117 ,  70.60642 ]], dtype=float32)

In [13]:
# CPU reference product, used to verify the GPU result programmatically
# rather than by eye. The tolerance leaves room for float32 rounding
# differences between the two implementations.
expected = np.matmul(test_a, test_b)
np.testing.assert_allclose(test_results_gpu.get(), expected, rtol=1e-4)
expected

array([[146.84898 ,  84.02154 ,  74.77297 , 117.34065 ],
       [114.17596 ,  63.42721 ,  52.48934 , 101.00363 ],
       [159.6409  ,  84.73081 , 108.75079 , 140.87265 ],
       [115.97036 ,  63.907093,  83.82117 ,  70.60642 ]], dtype=float32)